
feat:driver retry

guorong.zheng 2024-06-28 17:54:27 +08:00
parent c021b838e4
commit 57b3d72835

utils.py (136 changed lines)

@@ -18,6 +18,7 @@ from tqdm.asyncio import tqdm_asyncio
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium_stealth import stealth
import concurrent.futures
import sys
@@ -34,6 +35,7 @@ def retry_func(func, retries=max_retries + 1, name=""):
"""
for i in range(retries):
try:
sleep(3)
return func()
except Exception as e:
count = retries - 1
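For context, a minimal sketch of the retry wrapper this hunk touches; only the sleep-before-each-attempt line comes from the commit, the simplified signature, logging and final-failure behaviour are assumptions:

from time import sleep


def retry_func(func, retries=3, name=""):
    """Call func() up to `retries` times, pausing 3s before each attempt."""
    for i in range(retries):
        try:
            sleep(3)  # pause added in this commit before every attempt
            return func()
        except Exception as e:
            if i < retries - 1:
                print(f"{name} failed, retrying ({i + 1}/{retries - 1}): {e}")
                continue
            raise  # assumption: re-raise once all attempts are exhausted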
@@ -45,6 +47,34 @@ def retry_func(func, retries=max_retries + 1, name=""):
            continue


def locate_element_with_retry(driver, locator, timeout=timeout, retries=max_retries):
    """
    Locate the element with retry
    """
    wait = WebDriverWait(driver, timeout)
    for _ in range(retries):
        try:
            return wait.until(EC.presence_of_element_located(locator))
        except TimeoutException:
            driver.refresh()
    return None


def find_clickable_element_with_retry(
    driver, locator, timeout=timeout, retries=max_retries
):
    """
    Find the clickable element with retry
    """
    wait = WebDriverWait(driver, timeout)
    for _ in range(retries):
        try:
            return wait.until(EC.element_to_be_clickable(locator))
        except TimeoutException:
            driver.refresh()
    return None


def resource_path(relative_path, persistent=False):
    """
    Get the resource path
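A minimal usage sketch for the two helpers added above; the driver construction and target URL are placeholders, while the locator tuples mirror how the rest of this commit calls them:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # stand-in for the repository's setup_driver()
driver.get("https://example.com")  # placeholder URL

# Both helpers return the element, or None once every wait-and-refresh attempt times out.
search_box = locate_element_with_retry(driver, (By.XPATH, '//input[@type="text"]'))
if search_box:
    search_box.send_keys("channel name")

submit_button = find_clickable_element_with_retry(driver, (By.XPATH, '//input[@type="submit"]'))
if submit_button:
    driver.execute_script("arguments[0].click();", submit_button)

driver.quit()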
@@ -128,8 +158,7 @@ def get_proxy_list(page_count=1):
    ]
    proxy_list = []
    driver = setup_driver()
    pbar = tqdm_asyncio(total=page_count)
    pbar.set_description(f"Getting proxy list, {page_count} pages remaining")
    pbar = tqdm_asyncio(total=page_count, desc="Getting proxy list")
    for page_index in range(1, page_count + 1):
        for pattern in url_pattern:
            url = pattern.format(page_index)
@@ -151,9 +180,6 @@ def get_proxy_list(page_count=1):
                proxy = f"http://{ip}:{port}"
                proxy_list.append(proxy)
        pbar.update()
        pbar.set_description(
            f"Getting proxy list, {page_count - page_index} pages remaining"
        )
    pbar.close()
    return proxy_list
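The progress-bar change above replaces the per-page set_description update with a static desc passed at construction; a tiny stand-alone illustration (the page count is an arbitrary example):

from tqdm.asyncio import tqdm_asyncio

page_count = 3  # example value
pbar = tqdm_asyncio(total=page_count, desc="Getting proxy list")
for page_index in range(1, page_count + 1):
    # ... scrape one page of proxies ...
    pbar.update()
pbar.close()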
@@ -362,69 +388,81 @@ async def get_channels_by_online_search(names, callback):
def process_channel_by_online_search(name, proxy=None):
driver = setup_driver(proxy)
wait = WebDriverWait(driver, timeout)
info_list = []
try:
retry_func(lambda: driver.get(pageUrl), name=f"online search:{name}")
search_box = retry_func(
lambda: wait.until(
EC.presence_of_element_located((By.XPATH, '//input[@type="text"]'))
)
search_box = locate_element_with_retry(
driver, (By.XPATH, '//input[@type="text"]')
)
if not search_box:
return
search_box.clear()
search_box.send_keys(name)
submit_button = retry_func(
lambda: wait.until(
EC.element_to_be_clickable((By.XPATH, '//input[@type="submit"]'))
)
submit_button = find_clickable_element_with_retry(
driver, (By.XPATH, '//input[@type="submit"]')
)
if not submit_button:
return
sleep(3)
driver.execute_script("arguments[0].click();", submit_button)
isFavorite = name in config.favorite_list
pageNum = (
config.favorite_page_num if isFavorite else config.default_page_num
)
retry_limit = 3
for page in range(1, pageNum + 1):
try:
if page > 1:
sleep(1)
page_link = retry_func(
lambda: wait.until(
EC.element_to_be_clickable(
(
By.XPATH,
f'//a[contains(@href, "={page}") and contains(@href, "{name}")]',
)
)
retries = 0
while retries < retry_limit:
try:
if page > 1:
page_link = find_clickable_element_with_retry(
driver,
(
By.XPATH,
f'//a[contains(@href, "={page}") and contains(@href, "{name}")]',
),
)
if not page_link:
break
sleep(3)
driver.execute_script("arguments[0].click();", page_link)
sleep(3)
source = re.sub(
r"<!--.*?-->",
"",
driver.page_source,
flags=re.DOTALL,
)
if not page_link:
soup = BeautifulSoup(source, "html.parser")
if soup:
results = get_results_from_soup(soup, name)
print(name, "page:", page, "results num:", len(results))
if len(results) == 0 and retries < retry_limit - 1:
print(
f"{name}:No results found, refreshing page and retrying..."
)
driver.refresh()
retries += 1
continue
for result in results:
url, date, resolution = result
if url and check_url_by_patterns(url):
info_list.append((url, date, resolution))
break
else:
print(
f"{name}:No results found, refreshing page and retrying..."
)
driver.refresh()
retries += 1
continue
driver.execute_script("arguments[0].click();", page_link)
source = re.sub(
r"<!--.*?-->",
"",
driver.page_source,
flags=re.DOTALL,
)
soup = BeautifulSoup(source, "html.parser")
if soup:
results = get_results_from_soup(soup, name)
print(name, "page:", page, "results num:", len(results))
for result in results:
url, date, resolution = result
if url and check_url_by_patterns(url):
info_list.append((url, date, resolution))
else:
print(f"No results found for {name}")
except Exception as e:
print(f"Error on page {page}: {e}")
continue
except Exception as e:
print(f"{name}:Error on page {page}: {e}")
break
if retries == retry_limit:
print(f"{name}:Reached retry limit, moving to next page")
except Exception as e:
print(f"Error on search: {e}")
print(f"{name}:Error on search: {e}")
pass
finally:
channels[format_channel_name(name)] = info_list
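The rewritten search loop above retries each results page a bounded number of times, refreshing when nothing comes back and breaking out as soon as a page yields results. A simplified, self-contained sketch of that control flow; fetch_results is a hypothetical stand-in for the Selenium navigation and BeautifulSoup parsing, not a function from the repository:

import random


def fetch_results(page):
    """Hypothetical stand-in: sometimes returns an empty result list."""
    return ["result"] * random.choice([0, 0, 3])


retry_limit = 3
for page in range(1, 4):
    retries = 0
    while retries < retry_limit:
        try:
            results = fetch_results(page)
            if not results:
                retries += 1
                print(f"page {page}: no results, refreshing and retrying ({retries}/{retry_limit})")
                continue
            print(f"page {page}: {len(results)} results")
            break
        except Exception as e:
            print(f"page {page}: error: {e}")
            break
    if retries == retry_limit:
        print(f"page {page}: reached retry limit, moving to next page")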
@@ -451,9 +489,7 @@ async def get_channels_by_online_search(names, callback):
        loop = asyncio.get_running_loop()
        name = await names_queue.get()
        proxy = (
            proxy_list_test[proxy_index]
            if config.open_proxy and proxy_list_test
            else None
            proxy_list_test[0] if config.open_proxy and proxy_list_test else None
        )
        if config.open_proxy and proxy_list_test:
            proxy_index = (proxy_index + 1) % len(proxy_list_test)
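The final hunk switches the worker from round-robin proxy selection to always using the first proxy in the tested list (the index counter is still advanced). A tiny illustration of the two selection strategies with example data:

proxy_list_test = ["http://10.0.0.1:8080", "http://10.0.0.2:8080"]  # example data
proxy_index = 0

# Before: rotate through the tested proxies.
proxy_old = proxy_list_test[proxy_index]
proxy_index = (proxy_index + 1) % len(proxy_list_test)

# After this commit: always take the first tested proxy (None if the list is empty).
proxy_new = proxy_list_test[0] if proxy_list_test else None

print(proxy_old, proxy_new)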