feat:utils

This commit is contained in:
guorong.zheng 2024-03-15 11:08:46 +08:00
parent 19ea688903
commit d5b6d34493
2 changed files with 201 additions and 169 deletions

main.py (193 lines changed)

@@ -3,22 +3,20 @@ from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import re
from selenium_stealth import stealth
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import re
import datetime
from utils import (
getChannelItems,
removeFile,
outputTxt,
getUrlInfo,
compareSpeedAndResolution,
getTotalUrls,
)
class GetSource:
def __init__(self):
self.driver = self.setup_driver()
self.main()
class UpdateSource:
def setup_driver(self):
options = webdriver.ChromeOptions()
@@ -39,136 +37,11 @@ class GetSource:
)
return driver
def getChannelItems(self):
# Open the source file and read all lines.
with open(config.source_file, "r") as f:
lines = f.readlines()
# Create a dictionary to store the channels.
channels = {}
current_channel = ""
pattern = r"^(.*?),(?!#genre#)(.*?)$"
for line in lines:
line = line.strip()
if "#genre#" in line:
# This is a new channel, create a new key in the dictionary.
current_channel = line.split(",")[0]
channels[current_channel] = {}
else:
# This is a url, add it to the list of urls for the current channel.
match = re.search(pattern, line)
if match:
if match.group(1) not in channels[current_channel]:
channels[current_channel][match.group(1)] = [match.group(2)]
else:
channels[current_channel][match.group(1)].append(match.group(2))
return channels
async def getSpeed(self, url):
async with aiohttp.ClientSession() as session:
start = time.time()
try:
async with session.get(url, timeout=5) as response:
resStatus = response.status
except:
return url, float("inf")
end = time.time()
if resStatus == 200:
return url, end - start
else:
return url, float("inf")
async def compareSpeedAndResolution(self, infoList):
response_times = await asyncio.gather(
*(self.getSpeed(url) for url, _, _ in infoList)
)
valid_responses = [
(info, rt)
for info, rt in zip(infoList, response_times)
if rt[1] != float("inf")
]
def extract_resolution(resolution_str):
numbers = re.findall(r"\d+x\d+", resolution_str)
if numbers:
width, height = map(int, numbers[0].split("x"))
return width * height
else:
return 0
default_response_time_weight = 0.5
default_resolution_weight = 0.5
response_time_weight = getattr(
config, "response_time_weight", default_response_time_weight
)
resolution_weight = getattr(
config, "resolution_weight", default_resolution_weight
)
# Check if weights are valid
if not (
0 <= response_time_weight <= 1
and 0 <= resolution_weight <= 1
and response_time_weight + resolution_weight == 1
):
response_time_weight = default_response_time_weight
resolution_weight = default_resolution_weight
def combined_key(item):
(_, _, resolution), response_time = item
resolution_value = extract_resolution(resolution) if resolution else 0
return (
-(response_time_weight * response_time[1])
+ resolution_weight * resolution_value
)
sorted_res = sorted(valid_responses, key=combined_key)
return sorted_res
def removeFile(self):
if os.path.exists(config.final_file):
os.remove(config.final_file)
def outputTxt(self, cate, channelUrls):
# Update the final file.
with open(config.final_file, "a") as f:
f.write(cate + ",#genre#\n")
for name, urls in channelUrls.items():
for url in urls:
if url is not None:
f.write(name + "," + url + "\n")
f.write("\n")
def filterByDate(self, data):
default_recent_days = 60
use_recent_days = getattr(config, "recent_days", 60)
if (
not isinstance(use_recent_days, int)
or use_recent_days <= 0
or use_recent_days > 365
):
use_recent_days = default_recent_days
start_date = datetime.datetime.now() - datetime.timedelta(days=use_recent_days)
recent_data = []
for (url, date, resolution), response_time in data:
if date:
date = datetime.datetime.strptime(date, "%m-%d-%Y")
if date >= start_date:
recent_data.append(((url, date, resolution), response_time))
return recent_data
def getTotalUrls(self, data):
total_urls = []
if len(data) > config.urls_limit:
total_urls = [
url for (url, _, _), _ in self.filterByDate(data)[: config.urls_limit]
]
else:
total_urls = [url for (url, _, _), _ in data]
return list(dict.fromkeys(total_urls))
def __init__(self):
self.driver = self.setup_driver()
async def visitPage(self, channelItems):
self.removeFile()
removeFile()
for cate, channelObj in channelItems.items():
channelUrls = {}
for name in channelObj.keys():
@@ -198,45 +71,27 @@ class GetSource:
):
break
for result in results:
m3u8_div = result.find("div", class_="m3u8")
url = m3u8_div.text.strip() if m3u8_div else None
info_div = (
m3u8_div.find_next_sibling("div") if m3u8_div else None
)
date = resolution = None
if info_div:
info_text = info_div.text.strip()
date, resolution = (
(
info_text.partition(" ")[0]
if info_text.partition(" ")[0]
else None
),
(
info_text.partition(" ")[2].partition("•")[2]
if info_text.partition(" ")[2].partition("•")[2]
else None
),
)
infoList.append((url, date, resolution))
try:
url, date, resolution = getUrlInfo(result)
if url:
infoList.append((url, date, resolution))
except Exception as e:
print(f"Error on result {result}: {e}")
continue
except Exception as e:
print(f"Error on page {page}: {e}")
continue
try:
sorted_data = await self.compareSpeedAndResolution(
infoList
) # Sort by speed and resolution
channelUrls[name] = (
self.getTotalUrls(sorted_data) or channelObj[name]
) # Get the total urls with filter by date and limit
sorted_data = await compareSpeedAndResolution(infoList)
channelUrls[name] = getTotalUrls(sorted_data) or channelObj[name]
except Exception as e:
print(f"Error on sorting: {e}")
continue
self.outputTxt(cate, channelUrls)
outputTxt(cate, channelUrls)
await asyncio.sleep(1)
def main(self):
asyncio.run(self.visitPage(self.getChannelItems()))
asyncio.run(self.visitPage(getChannelItems()))
GetSource()
UpdateSource().main()

utils.py (new file, 177 lines)

@@ -0,0 +1,177 @@
import aiohttp
import asyncio
import config
import time
import re
import datetime
import os
def getChannelItems():
"""
Get the channel items from the source file
"""
# Open the source file and read all lines.
with open(config.source_file, "r") as f:
lines = f.readlines()
# Create a dictionary to store the channels.
channels = {}
current_channel = ""
pattern = r"^(.*?),(?!#genre#)(.*?)$"
for line in lines:
line = line.strip()
if "#genre#" in line:
# This is a new channel, create a new key in the dictionary.
current_channel = line.split(",")[0]
channels[current_channel] = {}
else:
# This is a url, add it to the list of urls for the current channel.
match = re.search(pattern, line)
if match:
if match.group(1) not in channels[current_channel]:
channels[current_channel][match.group(1)] = [match.group(2)]
else:
channels[current_channel][match.group(1)].append(match.group(2))
return channels
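For reference, getChannelItems expects the source file to contain a "name,#genre#" line opening each category, followed by "channel,url" lines grouped under it. A minimal sketch with hypothetical contents for config.source_file:
# Hypothetical source file:
#   央视频道,#genre#
#   CCTV-1,http://example.com/cctv1.m3u8
#   CCTV-1,http://example.com/cctv1_hd.m3u8
#
# getChannelItems() would then return:
#   {"央视频道": {"CCTV-1": ["http://example.com/cctv1.m3u8",
#                            "http://example.com/cctv1_hd.m3u8"]}}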
def removeFile():
"""
Remove the old final file
"""
if os.path.exists(config.final_file):
os.remove(config.final_file)
def outputTxt(cate, channelUrls):
"""
Update the final file
"""
with open(config.final_file, "a") as f:
f.write(cate + ",#genre#\n")
for name, urls in channelUrls.items():
for url in urls:
if url is not None:
f.write(name + "," + url + "\n")
f.write("\n")
def getUrlInfo(result):
"""
Get the url, date and resolution
"""
m3u8_div = result.find("div", class_="m3u8")
url = m3u8_div.text.strip() if m3u8_div else None
info_div = m3u8_div.find_next_sibling("div") if m3u8_div else None
date = resolution = None
if info_div:
info_text = info_div.text.strip()
date, resolution = (
(info_text.partition(" ")[0] if info_text.partition(" ")[0] else None),
(
info_text.partition(" ")[2].partition("•")[2]
if info_text.partition(" ")[2].partition("•")[2]
else None
),
)
return url, date, resolution
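getUrlInfo assumes the div following the m3u8 div holds the date before the first space and the resolution after the "•" separator used in the parsing above; an illustrative case (all values hypothetical):
# result html roughly like:
#   <div class="m3u8">http://example.com/cctv1.m3u8</div>
#   <div>03-10-2024 19:45:12•1920x1080</div>
# getUrlInfo(result) -> ("http://example.com/cctv1.m3u8", "03-10-2024", "1920x1080")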
async def getSpeed(url):
"""
Get the speed of the url
"""
async with aiohttp.ClientSession() as session:
start = time.time()
try:
async with session.get(url, timeout=5) as response:
resStatus = response.status
except:
return url, float("inf")
end = time.time()
if resStatus == 200:
return url, end - start
else:
return url, float("inf")
async def compareSpeedAndResolution(infoList):
"""
Sort by speed and resolution
"""
response_times = await asyncio.gather(*(getSpeed(url) for url, _, _ in infoList))
valid_responses = [
(info, rt)
for info, rt in zip(infoList, response_times)
if rt[1] != float("inf")
]
def extract_resolution(resolution_str):
numbers = re.findall(r"\d+x\d+", resolution_str)
if numbers:
width, height = map(int, numbers[0].split("x"))
return width * height
else:
return 0
default_response_time_weight = 0.5
default_resolution_weight = 0.5
response_time_weight = getattr(
config, "response_time_weight", default_response_time_weight
)
resolution_weight = getattr(config, "resolution_weight", default_resolution_weight)
# Check if weights are valid
if not (
0 <= response_time_weight <= 1
and 0 <= resolution_weight <= 1
and response_time_weight + resolution_weight == 1
):
response_time_weight = default_response_time_weight
resolution_weight = default_resolution_weight
def combined_key(item):
(_, _, resolution), response_time = item
resolution_value = extract_resolution(resolution) if resolution else 0
return (
-(response_time_weight * response_time[1])
+ resolution_weight * resolution_value
)
sorted_res = sorted(valid_responses, key=combined_key)
return sorted_res
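With the default 0.5/0.5 weights, an entry with a 0.8 s response time and a "1920x1080" resolution gets combined_key = -(0.5 * 0.8) + 0.5 * (1920 * 1080) = 1036799.6, and sorted() orders the surviving entries by this key in ascending order. The weights can be overridden from config; a hypothetical override (each must lie in [0, 1] and they must sum to 1, otherwise the defaults are restored):
# in config.py (hypothetical values)
response_time_weight = 0.4
resolution_weight = 0.6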
def filterByDate(data):
"""
Filter entries to those dated within the recent_days window
"""
default_recent_days = 60
use_recent_days = getattr(config, "recent_days", 60)
if (
not isinstance(use_recent_days, int)
or use_recent_days <= 0
or use_recent_days > 365
):
use_recent_days = default_recent_days
start_date = datetime.datetime.now() - datetime.timedelta(days=use_recent_days)
recent_data = []
for (url, date, resolution), response_time in data:
if date:
date = datetime.datetime.strptime(date, "%m-%d-%Y")
if date >= start_date:
recent_data.append(((url, date, resolution), response_time))
return recent_data
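filterByDate parses each date with the "%m-%d-%Y" format and keeps only entries inside the recent_days window; an illustrative case (values hypothetical):
# e.g. with recent_days = 60 and "now" = 03-15-2024:
#   (("http://example.com/a.m3u8", "03-10-2024", "1920x1080"), 0.42)  -> kept
#   (("http://example.com/b.m3u8", "12-01-2023", "1280x720"), 0.31)   -> dropped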
def getTotalUrls(data):
"""
Get the final urls, filtered by date and capped at config.urls_limit
"""
total_urls = []
if len(data) > config.urls_limit:
total_urls = [url for (url, _, _), _ in filterByDate(data)[: config.urls_limit]]
else:
total_urls = [url for (url, _, _), _ in data]
return list(dict.fromkeys(total_urls))
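For completeness, these are the attributes utils.py reads from config; a hypothetical config.py covering them (recent_days and the two weights are optional thanks to the getattr fallbacks):
# config.py (illustrative values only)
source_file = "demo.txt"        # channel list read by getChannelItems
final_file = "result.txt"       # output rewritten by removeFile / outputTxt
urls_limit = 15                 # per-channel cap applied in getTotalUrls
recent_days = 30                # optional; filterByDate defaults to 60
response_time_weight = 0.4      # optional; defaults to 0.5
resolution_weight = 0.6         # optional; defaults to 0.5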