Merge pull request from Guovin/dev

Release: v1.1.0
This commit is contained in:
Govin 2024-04-26 17:21:01 +08:00 committed by GitHub
commit e9311f96ab
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 177 additions and 75 deletions

@ -1,5 +1,11 @@
# 更新日志(Changelog)
## v1.1.0
### 2024/4/26
- 新增自定义接口获取源,配置项为 extend_base_urls (#56) / Added custom interface for source acquisition, the configuration item is extend_base_urls (#56)
## v1.0.9
### 2024/4/25

@ -16,23 +16,25 @@ Customize channel menus and automatically obtain and update the latest live sour
- Ensure update timeliness, configure to retrieve interfaces updated within a recent time range
- Can filter ipv4, ipv6 interfaces
- Blacklist feature: Interface domain and keywords
- Customize the source of interface acquisition
## Config
| Configuration Item | Default Value | Description |
| ---------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------ |
| source_file | "demo.txt" | Template file name |
| final_file | "result.txt" | Generated file name |
| favorite_list | ["CCTV1","CCTV13"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) |
| favorite_page_num | 5 | Page retrieval quantity for favorite channels |
| default_page_num | 3 | Page retrieval quantity for regular channels |
| urls_limit | 10 | Number of interfaces per channel |
| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) |
| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) |
| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues |
| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" |
| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains |
| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters |
| Configuration Item | Default Value | Description |
| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| source_file | "demo.txt" | Template file name |
| final_file | "result.txt" | Generated file name |
| favorite_list | ["CCTV1","CCTV13"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) |
| favorite_page_num | 5 | Page retrieval quantity for favorite channels |
| default_page_num | 3 | Page retrieval quantity for regular channels |
| urls_limit | 10 | Number of interfaces per channel |
| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) |
| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) |
| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues |
| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" |
| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains |
| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters |
| extend_base_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",<br>"https://m3u.ibert.me/txt/o_cn.txt",<br>"https://m3u.ibert.me/txt/j_iptv.txt"] | The source of interface acquisition, currently only compatible with specific content formats and fuzzy matching of some channel names |
## Quick Start

@ -16,23 +16,25 @@
- 保证更新时效性,配置获取最近时间范围内更新的接口
- 可过滤 ipv4、ipv6 接口
- 黑名单功能:接口域名与关键字
- 自定义接口获取源
## 配置
| 配置项 | 默认值 | 描述 |
| ---------------------- | ------------------ | ------------------------------------------------------------------ |
| source_file | "demo.txt" | 模板文件名称 |
| final_file | "result.txt" | 生成文件名称 |
| favorite_list | ["CCTV1","CCTV13"] | 关注频道名称列表(仅用于与常规频道区分,自定义获取分页数量) |
| favorite_page_num | 5 | 关注频道获取分页数量 |
| default_page_num | 3 | 常规频道获取分页数量 |
| urls_limit | 10 | 单个频道接口数量 |
| response_time_weight   | 0.5                | 响应时间权重值(所有权重值总和应为 1)                               |
| resolution_weight      | 0.5                | 分辨率权重值(所有权重值总和应为 1)                                 |
| recent_days | 30 | 获取最近时间范围内更新的接口(单位天),适当减小可避免出现匹配问题 |
| ipv_type | "ipv4" | 生成结果中接口的类型,可选值:"ipv4"、"ipv6"、"all" |
| domain_blacklist | ["epg.pw"] | 接口域名黑名单,用于过滤低质量含广告类域名的接口 |
| url_keywords_blacklist | [] | 接口关键字黑名单,用于过滤含特定字符的接口 |
| 配置项 | 默认值 | 描述 |
| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------ |
| source_file | "demo.txt" | 模板文件名称 |
| final_file | "result.txt" | 生成文件名称 |
| favorite_list | ["CCTV1","CCTV13"] | 关注频道名称列表(仅用于与常规频道区分,自定义获取分页数量) |
| favorite_page_num | 5 | 关注频道获取分页数量 |
| default_page_num | 3 | 常规频道获取分页数量 |
| urls_limit | 10 | 单个频道接口数量 |
| response_time_weight   | 0.5                                                                                                                         | 响应时间权重值(所有权重值总和应为 1)                               |
| resolution_weight      | 0.5                                                                                                                         | 分辨率权重值(所有权重值总和应为 1)                                 |
| recent_days | 30 | 获取最近时间范围内更新的接口(单位天),适当减小可避免出现匹配问题 |
| ipv_type | "ipv4" | 生成结果中接口的类型,可选值:"ipv4"、"ipv6"、"all" |
| domain_blacklist | ["epg.pw"] | 接口域名黑名单,用于过滤低质量含广告类域名的接口 |
| url_keywords_blacklist | [] | 接口关键字黑名单,用于过滤含特定字符的接口 |
| extend_base_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",<br>"https://m3u.ibert.me/txt/o_cn.txt",<br>"https://m3u.ibert.me/txt/j_iptv.txt"] | 接口获取源,目前仅兼容特定内容格式与部分频道名称的模糊匹配 |
## 快速上手

@ -24,3 +24,8 @@ recent_days = 30
ipv_type = "ipv4"
domain_blacklist = ["epg.pw"]
url_keywords_blacklist = []
extend_base_urls = [
"https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt",
]

@ -57,20 +57,21 @@ Similar to editing the template, modify the running configuration
Adjust the configuration as needed. Below is the default configuration explanation:
| Configuration Item | Default Value | Description |
| ---------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------ |
| source_file | "demo.txt" | Template file name |
| final_file | "result.txt" | Generated file name |
| favorite_list | ["CCTV1","CCTV13"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) |
| favorite_page_num | 5 | Page retrieval quantity for favorite channels |
| default_page_num | 3 | Page retrieval quantity for regular channels |
| urls_limit | 10 | Number of interfaces per channel |
| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) |
| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) |
| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues |
| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" |
| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains |
| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters |
| Configuration Item | Default Value | Description |
| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| source_file | "demo.txt" | Template file name |
| final_file | "result.txt" | Generated file name |
| favorite_list | ["CCTV1","CCTV13"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) |
| favorite_page_num | 5 | Page retrieval quantity for favorite channels |
| default_page_num | 3 | Page retrieval quantity for regular channels |
| urls_limit | 10 | Number of interfaces per channel |
| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) |
| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) |
| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues |
| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" |
| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains |
| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters |
| extend_base_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",<br>"https://m3u.ibert.me/txt/o_cn.txt",<br>"https://m3u.ibert.me/txt/j_iptv.txt"] | The source of interface acquisition, currently only compatible with specific content formats and fuzzy matching of some channel names |
## Step 4: Run Updates Locally (Recommended, Stable, Supports a large number of channel updates)

@ -57,7 +57,7 @@
按照您的需要适当调整配置,以下是默认配置说明
| 配置项 | 默认值 | 描述 |
| -------------------- | ------------------ | ------------------------------------------------------------------ |
| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------ |
| source_file | "demo.txt" | 模板文件名称 |
| final_file | "result.txt" | 生成文件名称 |
| favorite_list | ["CCTV1","CCTV13"] | 关注频道名称列表(仅用于与常规频道区分,自定义获取分页数量) |
@ -70,6 +70,7 @@
| ipv_type | "ipv4" | 生成结果中接口的类型,可选值:"ipv4"、"ipv6"、"all" |
| domain_blacklist | ["epg.pw"] | 接口域名黑名单,用于过滤低质量含广告类域名的接口 |
| url_keywords_blacklist | [] | 接口关键字黑名单,用于过滤含特定字符的接口 |
| extend_base_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",<br>"https://m3u.ibert.me/txt/o_cn.txt",<br>"https://m3u.ibert.me/txt/j_iptv.txt"] | 接口获取源,目前仅兼容特定内容格式与部分频道名称的模糊匹配 |
## 步骤四:本地运行更新(推荐,稳定,支持大量频道更新)

68
main.py

@ -14,13 +14,12 @@ from utils import (
updateChannelUrlsTxt,
updateFile,
getUrlInfo,
compareSpeedAndResolution,
sortUrlsBySpeedAndResolution,
getTotalUrls,
checkUrlIPVType,
checkByDomainBlacklist,
checkByURLKeywordsBlacklist,
filterUrlsByPatterns,
useAccessibleUrl,
getChannelsByExtendBaseUrls,
checkUrlByPatterns,
)
import logging
from logging.handlers import RotatingFileHandler
@ -61,9 +60,14 @@ class UpdateSource:
self.driver = self.setup_driver()
async def visitPage(self, channelItems):
total_channels = sum(len(channelObj) for _, channelObj in channelItems.items())
channelNames = [
name for _, channelObj in channelItems.items() for name in channelObj.keys()
]
extendResults = await getChannelsByExtendBaseUrls(channelNames)
total_channels = len(channelNames)
pbar = tqdm(total=total_channels)
pageUrl = await useAccessibleUrl()
wait = WebDriverWait(self.driver, 10)
for cate, channelObj in channelItems.items():
channelUrls = {}
channelObjKeys = channelObj.keys()
@ -71,26 +75,41 @@ class UpdateSource:
pbar.set_description(
f"Processing {name}, {total_channels - pbar.n} channels remaining"
)
self.driver.get(pageUrl)
search_box = self.driver.find_element(By.XPATH, '//input[@type="text"]')
search_box.clear()
search_box.send_keys(name)
submit_button = self.driver.find_element(
By.XPATH, '//input[@type="submit"]'
)
submit_button.click()
isFavorite = name in config.favorite_list
pageNum = (
config.favorite_page_num if isFavorite else config.default_page_num
)
infoList = []
for url, date, resolution in extendResults.get(name, []):
if url and checkUrlByPatterns(url):
infoList.append((url, None, resolution))
if pageUrl:
self.driver.get(pageUrl)
search_box = wait.until(
EC.presence_of_element_located(
(By.XPATH, '//input[@type="text"]')
)
)
search_box.clear()
search_box.send_keys(name)
submit_button = wait.until(
EC.element_to_be_clickable(
(By.XPATH, '//input[@type="submit"]')
)
)
submit_button.click()
isFavorite = name in config.favorite_list
pageNum = (
config.favorite_page_num
if isFavorite
else config.default_page_num
)
for page in range(1, pageNum + 1):
try:
if page > 1:
page_link = self.driver.find_element(
By.XPATH,
f'//a[contains(@href, "={page}") and contains(@href, "{name}")]',
page_link = wait.until(
EC.element_to_be_clickable(
(
By.XPATH,
f'//a[contains(@href, "={page}") and contains(@href, "{name}")]',
)
)
)
page_link.click()
soup = BeautifulSoup(self.driver.page_source, "html.parser")
@ -100,12 +119,7 @@ class UpdateSource:
for result in results:
try:
url, date, resolution = getUrlInfo(result)
if (
url
and checkUrlIPVType(url)
and checkByDomainBlacklist(url)
and checkByURLKeywordsBlacklist(url)
):
if url and checkUrlByPatterns(url):
infoList.append((url, date, resolution))
except Exception as e:
print(f"Error on result {result}: {e}")
@ -118,7 +132,7 @@ class UpdateSource:
if not github_actions or (
pbar.n <= 200 and github_actions == "true"
):
sorted_data = await compareSpeedAndResolution(infoList)
sorted_data = await sortUrlsBySpeedAndResolution(infoList)
if sorted_data:
channelUrls[name] = getTotalUrls(sorted_data)
for (url, date, resolution), response_time in sorted_data:

@ -11,6 +11,8 @@ import os
import urllib.parse
import ipaddress
from urllib.parse import urlparse
import requests
import re
def getChannelItems():
@ -41,10 +43,14 @@ def getChannelItems():
else:
# This is a url, add it to the list of urls for the current channel.
match = re.search(pattern, line)
if match:
if match is not None:
if match.group(1) not in channels[current_category]:
channels[current_category][match.group(1)] = [match.group(2)]
else:
elif (
match.group(2)
and match.group(2)
not in channels[current_category][match.group(1)]
):
channels[current_category][match.group(1)].append(
match.group(2)
)
@ -53,6 +59,60 @@ def getChannelItems():
f.close()
async def getChannelsByExtendBaseUrls(channel_names):
    """
    Fetch channel urls from the configured extend base urls.

    For each url in ``config.extend_base_urls``, download the text content,
    parse ``name,url`` lines (a negative lookahead skips ``xxx,#genre#``
    category rows), and fuzzy-match the parsed names against
    *channel_names* — case-insensitively, after stripping ``_(...)`` /
    ``_[...]`` suffixes and the literal suffix "频道".

    Args:
        channel_names: iterable of channel names to look up.

    Returns:
        dict mapping each matched channel name to a list of
        (url, None, resolution) tuples; unmatched names are absent.
    """
    channels = {}
    # Compiled once: the original re-ran re.match(pattern, line) three
    # times per matching line inside the inner loop.
    line_pattern = re.compile(r"^(.*?),(?!#genre#)(.*?)$")
    sub_pattern = r"_\((.*?)\)|_\[(.*?)\]|频道"
    for base_url in config.extend_base_urls:
        try:
            print(f"Processing extend base url: {base_url}")
            try:
                response = requests.get(base_url, timeout=30)
            except requests.exceptions.Timeout:
                print(f"Timeout on {base_url}")
                continue
            content = response.text
            if content:
                link_dict = {}
                for line in content.split("\n"):
                    match = line_pattern.match(line)
                    if match is None:
                        continue
                    key = match.group(1)
                    # Resolution is encoded as e.g. "CCTV1_(1080p)".
                    resolution_match = re.search(r"_(\((.*?)\))", key)
                    resolution = (
                        resolution_match.group(2)
                        if resolution_match is not None
                        else None
                    )
                    key = re.sub(sub_pattern, "", key).lower()
                    url = match.group(2)
                    link_dict.setdefault(key, []).append((url, None, resolution))
                found_channels = []
                for channel_name in channel_names:
                    sub_channel_name = re.sub(sub_pattern, "", channel_name).lower()
                    values = link_dict.get(sub_channel_name)
                    if values:
                        if channel_name in channels:
                            channels[channel_name] += values
                        else:
                            channels[channel_name] = values
                        found_channels.append(channel_name)
                if found_channels:
                    print(f"{base_url} found channels: {','.join(found_channels)}")
        except Exception as e:
            # Best-effort: one bad source must not abort the remaining urls.
            print(f"Error on {base_url}: {e}")
            continue
    print("Finished processing extend base urls")
    return channels
def updateChannelUrlsTxt(cate, channelUrls):
"""
Update the category and channel urls to the final file
@ -89,7 +149,7 @@ def getUrlInfo(result):
r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
channel_text,
)
if url_match:
if url_match is not None:
url = url_match.group()
info_text = result_div[-1].get_text(strip=True)
if info_text:
@ -122,7 +182,7 @@ async def getSpeed(url, urlTimeout=5):
return float("inf")
async def compareSpeedAndResolution(infoList):
async def sortUrlsBySpeedAndResolution(infoList):
"""
Sort by speed and resolution
"""
@ -249,6 +309,17 @@ def checkByURLKeywordsBlacklist(url):
return not any(keyword in url for keyword in url_keywords_blacklist)
def checkUrlByPatterns(url):
    """
    Run every url filter in sequence.

    Returns True only when *url* passes the ipv-type check, the domain
    blacklist, and the url-keywords blacklist; evaluation stops at the
    first failing filter.
    """
    filters = (
        checkUrlIPVType,
        checkByDomainBlacklist,
        checkByURLKeywordsBlacklist,
    )
    return all(passes(url) for passes in filters)
def filterUrlsByPatterns(urls):
"""
Filter urls by patterns

@ -1,3 +1,3 @@
{
"version": "1.0.9"
"version": "1.1.0"
}