Невозможно использовать selenium с многопоточностью в python - Сервер Digital Ocean

Я пытаюсь сделать проект для скрапинга, используя несколько потоков для ускорения работы. Когда я запускаю проект на своём компьютере, он работает хорошо, но после развёртывания на сервере Digital Ocean он перестаёт работать и выдаёт ошибку Timed out receiving message from renderer: 60.000.

Программа прекрасно работает без многопоточности, но выполняется очень медленно.

Сервер представляет собой обычный Django-сервер, а скрапер запускается в модуле views Django после POST-запроса.

Мой компьютер:

  • macOS Sonoma 14.2.1
  • MacBook Air M2
  • 8 ГБ оперативной памяти

Дроплет Digital Ocean:

  • Ubuntu 23.10 x64
  • 1 AMD vCPU
  • 1 ГБ RAM
  • 25 GB Disk + 10 GB

Релевантный код:

from pathlib import Path
from multiprocessing import Manager
from multiprocessing.pool import ThreadPool

from .scrapers.scraper_onu_odc import scrap_from_onu_odc
from .scrapers.scraper_peps import scrap_from_senaclaft_peps
from .scrapers.scraper_bcu import scrap_from_bcu_infractores_cheques
from .scrapers.scraper_onu_scs import scrap_from_onu_scs
from .scrapers.scraper_ofac import scrap_from_ofac
from .scrapers.scraper_google import scrap_from_google
from .scrapers.scraper_yahoo import scrap_from_yahoo
from .scrapers.scraper_bing import scrap_from_bing
from .scrapers.scraper_wikipedia import scrap_from_wikipedia
from .scrapers.scraper_fincen import scrap_from_fincen

# Directory three levels above this file's resolved (absolute) location.
BASE_DIR = Path(__file__).resolve().parent.parent.parent

def scrapping_pool(scraper):
    """Run one scraper job described as a ``(callable, args)`` pair.

    The callable stored at index 0 is invoked with the positional arguments
    stored at index 1. Its return value is discarded, so this function
    always returns ``None``.
    """
    func = scraper[0]
    args = scraper[1]
    func(*args)


def get_search_results(info, uuid, max_workers=8):
    """Run every scraper for one person and collect what they report.

    Args:
        info: Mapping that provides the ``'person_to_search_name'`` and
            ``'person_to_search_surname'`` strings.
        uuid: Identifier of this search; it names the screenshots
            sub-folder under ``searcher/screenshots/``.
        max_workers: Upper bound on the number of scrapers running at the
            same time. Defaults to 8 (the previous hard-coded value).
            NOTE(review): the scrapers appear to start one browser each, so
            on a small host (1 vCPU / 1 GB RAM) a much lower value is
            probably needed to avoid renderer timeouts -- confirm by testing.

    Returns:
        A ``(results, screenshots_path)`` tuple. ``results`` is the dict
        handed to every scraper as its second argument (scrapers are
        expected to store their outcome in it); ``screenshots_path`` is the
        folder path built for this ``uuid``.
    """
    paths = {
        'screenshots_path': BASE_DIR / f'searcher/screenshots/{uuid}/',
        'certificate_path': str(BASE_DIR / 'searcher/ssl_certificates/bcu.pem'),
    }

    # The workers are threads of this process, so a plain dict is visible to
    # all of them. A multiprocessing.Manager would start an extra helper
    # process on every call that is never shut down.
    results = {}

    person_to_search = info['person_to_search_name'] + ' ' + info['person_to_search_surname']

    params = (person_to_search, results, paths)
    scrapers = [
        (scrap_from_onu_odc, params),
        (scrap_from_senaclaft_peps, params),
        (scrap_from_bcu_infractores_cheques, params),
        (scrap_from_onu_scs, params),
        (scrap_from_ofac, params),
        (scrap_from_fincen, params),
        (scrap_from_google, params),
        (scrap_from_yahoo, params),
        (scrap_from_bing, params),
        (scrap_from_wikipedia, params),
    ]

    # Never ask for more threads than there are jobs, and never fewer than 1
    # (ThreadPool rejects a size below 1).
    worker_count = max(1, min(max_workers, len(scrapers)))

    try:
        # map() blocks until every job has finished; leaving the ``with``
        # block then tears the pool down, so no explicit close() is needed.
        with ThreadPool(worker_count) as pool:
            pool.map(scrapping_pool, scrapers)
    except Exception as e:
        # Best effort: report the failure but still return whatever the
        # scrapers managed to store.
        print(e)

    return results, paths['screenshots_path']

Пример скрапера (остальные устроены аналогично):

import os
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from ...pages import get_pages
from .utils import get_selenium_params

def scrap_from_yahoo(name, results_dic, paths):
    """Open the Yahoo search page for *name* and save a screenshot of it.

    Args:
        name: Text to search for; spaces are replaced with ``+`` before it
            is appended to the configured Yahoo URL.
        results_dic: Mapping that receives the outcome under the ``'yahoo'``
            key as ``(has_matches, screenshot_path, screenshot_folder, url)``.
        paths: Mapping whose ``'screenshots_path'`` entry is the folder the
            screenshot is written to (created if missing).

    Note:
        ``has_matches`` only records that the page was loaded and the
        screenshot was saved without an exception; the page content is not
        inspected.
    """
    page_info = get_pages()
    screenshot_folder = paths['screenshots_path']
    selenium_params = get_selenium_params()

    # Build everything that can fail cheaply before a browser is started.
    url = page_info['yahoo']['url'] + name.replace(' ', '+')
    screenshot_path = f'{screenshot_folder}/yahoo.png'

    browser = webdriver.Chrome(service=selenium_params['service'], options=selenium_params['options'])
    try:
        # Create the folder that will hold the screenshots.
        os.makedirs(screenshot_folder, exist_ok=True)

        has_matches = False
        try:
            browser.get(url)
            browser.set_window_size(1366, 728)
            browser.save_screenshot(screenshot_path)
            has_matches = True
        except Exception:
            # Any browser failure is reported as "no result" rather than raised.
            has_matches = False
    finally:
        # Always shut the browser and its driver process down; without this
        # every call leaves a Chrome instance running and holding memory.
        browser.quit()

    results_dic['yahoo'] = (has_matches, screenshot_path, screenshot_folder, url)

Я попробовал использовать следующие опции с chromedriver:

def get_selenium_params():
    """Build the chromedriver service and Chrome options for a scraper.

    Returns:
        dict with two entries: ``'service'``, a ``Service`` pointing at the
        chromedriver binary under ``BASE_DIR``, and ``'options'``, a
        ``ChromeOptions`` object configured for the Chromium binary at
        ``/usr/bin/chromium-browser`` with the command-line flags below.
    """
    # Flags are added in this exact order.
    chrome_flags = (
        '--headless',
        '--start-maximized',
        '--remote-debugging-pipe',
        "enable-automation",
        "--no-sandbox",
        "--disable-extensions",
        "--dns-prefetch-disable",
        "--disable-gpu",
        '--disable-dev-shm-usage',
    )

    chromedriver = BASE_DIR / 'driver/chromedriver-linux64/chromedriver'

    options = webdriver.ChromeOptions()
    options.binary_location = "/usr/bin/chromium-browser"
    for flag in chrome_flags:
        options.add_argument(flag)

    return {
        'service': Service(executable_path=chromedriver),
        'options': options,
    }

Ошибка:

Message: timeout: Timed out receiving message from renderer: 60.000
Stacktrace:
#0 0x594000d8b8a3 <unknown>
#1 0x594000a818c6 <unknown>
#2 0x594000a697a0 <unknown>
#3 0x594000a67516 <unknown>
#4 0x594000a67b9f <unknown>
#5 0x594000ab53de <unknown>
#6 0x594000ab1d3d <unknown>
#7 0x594000afaaed <unknown>
#8 0x594000aee343 <unknown>
#9 0x594000abf593 <unknown>
#10 0x594000abff5e <unknown>
#11 0x594000d4f88b <unknown>
#12 0x594000d537e5 <unknown>
#13 0x594000d3d5b1 <unknown>
#14 0x594000d54372 <unknown>
#15 0x594000d228bf <unknown>
#16 0x594000d7a768 <unknown>
#17 0x594000d7a93b <unknown>
#18 0x594000d8a9f4 <unknown>
#19 0x707d49897b5a <unknown>
Вернуться наверх