Невозможно использовать selenium с многопоточностью в python - Сервер Digital Ocean
Я пытаюсь написать проект для веб-скрейпинга, используя несколько потоков для ускорения работы. Когда я запускаю проект на своём компьютере, он работает хорошо, но после развёртывания на сервере Digital Ocean он перестаёт работать и выбрасывает ошибку `Timed out receiving message from renderer: 60.000`.
Программа прекрасно работает без многопоточности, но выполняется очень медленно.
Сервер представляет собой обычный Django-сервер, а скрейпер запускается в модуле views Django после POST-запроса.
Мой компьютер:
- macOS Sonoma 14.2.1
- MacBook Air M2
- 8 ГБ оперативной памяти
Дроплет Digital Ocean:
- Ubuntu 23.10 x64
- 1 AMD vCPU
- 1 ГБ RAM
- 25 GB Disk + 10 GB
Релевантный код:
import logging
import os
from multiprocessing import Manager
from multiprocessing.pool import ThreadPool
from pathlib import Path

from .scrapers.scraper_onu_odc import scrap_from_onu_odc
from .scrapers.scraper_peps import scrap_from_senaclaft_peps
from .scrapers.scraper_bcu import scrap_from_bcu_infractores_cheques
from .scrapers.scraper_onu_scs import scrap_from_onu_scs
from .scrapers.scraper_ofac import scrap_from_ofac
from .scrapers.scraper_google import scrap_from_google
from .scrapers.scraper_yahoo import scrap_from_yahoo
from .scrapers.scraper_bing import scrap_from_bing
from .scrapers.scraper_wikipedia import scrap_from_wikipedia
from .scrapers.scraper_fincen import scrap_from_fincen
# Absolute directory two levels above the folder that contains this module;
# used below as the root for the screenshot and certificate paths.
# NOTE(review): presumably the Django project root — confirm.
BASE_DIR = Path(__file__).resolve().parent.parent.parent
def scrapping_pool(scraper):
    """Run a single scraper job.

    ``scraper`` is an indexable pair: the callable to invoke at index 0 and
    the positional arguments for it at index 1. The callable's return value
    is discarded, so this function always returns ``None``.
    """
    target = scraper[0]
    positional_args = scraper[1]
    target(*positional_args)
def get_search_results(info, uuid, max_workers=None):
    """Run every scraper for one person and collect their results.

    Args:
        info: Mapping with the keys ``'person_to_search_name'`` and
            ``'person_to_search_surname'``; the two values are joined with a
            space to build the search term.
        uuid: Identifier of this search; used as the name of the screenshot
            sub-folder.
        max_workers: Maximum number of scrapers run concurrently. Every
            scraper receives the same arguments and (in the scraper shown
            with this code) launches its own browser, so running more
            workers than the host can serve leads to renderer timeouts.
            Defaults to the number of CPUs, capped at 8 (the previous
            hard-coded pool size). The value is always clamped to the range
            ``1..len(scrapers)``.

    Returns:
        A ``(results, screenshots_path)`` tuple. ``results`` is the dict the
        scrapers fill in; ``screenshots_path`` is the folder path handed to
        the scrapers for their screenshots.
    """
    paths = {
        'screenshots_path': BASE_DIR / f'searcher/screenshots/{uuid}/',
        'certificate_path': str(BASE_DIR / 'searcher/ssl_certificates/bcu.pem'),
    }

    # The workers are threads of this process and share its memory, so a
    # plain dict is enough. A multiprocessing Manager would spawn an extra
    # server process per request and was never shut down.
    results = {}

    person_to_search = info['person_to_search_name'] + ' ' + info['person_to_search_surname']
    params = (person_to_search, results, paths)

    scrapers = [
        (scrap_from_onu_odc, params),
        (scrap_from_senaclaft_peps, params),
        (scrap_from_bcu_infractores_cheques, params),
        (scrap_from_onu_scs, params),
        (scrap_from_ofac, params),
        (scrap_from_fincen, params),
        (scrap_from_google, params),
        (scrap_from_yahoo, params),
        (scrap_from_bing, params),
        (scrap_from_wikipedia, params),
    ]

    if max_workers is None:
        # Size the pool to the machine instead of always using 8 threads.
        max_workers = min(8, os.cpu_count() or 1)
    max_workers = max(1, min(max_workers, len(scrapers)))

    try:
        # Leaving the ``with`` block tears the pool down; no explicit
        # close() is needed. map() blocks until every job has finished.
        with ThreadPool(max_workers) as pool:
            pool.map(scrapping_pool, scrapers)
    except Exception:
        # Best effort: keep whatever the scrapers managed to store, but
        # record the failure together with its traceback.
        logging.getLogger(__name__).exception('Scraper pool failed')

    return results, paths['screenshots_path']
Пример скрейпера (остальные устроены аналогично):
import os
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from ...pages import get_pages
from .utils import get_selenium_params
def scrap_from_yahoo(name, results_dic, paths):
    """Open the Yahoo search page for ``name`` and save a screenshot of it.

    Args:
        name: Search term; spaces are replaced with ``+`` before the term is
            appended to the configured Yahoo URL.
        results_dic: Mutable mapping shared with the caller. The outcome is
            stored under the key ``'yahoo'`` as the tuple
            ``(has_matches, screenshot_path, screenshot_folder, url)``.
        paths: Mapping whose ``'screenshots_path'`` entry is the folder the
            screenshot is written to.

    ``has_matches`` is ``True`` when loading the page, resizing the window
    and taking the screenshot all completed without raising, and ``False``
    otherwise.
    """
    page_info = get_pages()
    screenshot_folder = paths['screenshots_path']
    url = page_info['yahoo']['url'] + name.replace(' ', '+')
    screenshot_path = f'{screenshot_folder}/yahoo.png'

    # Create the screenshot folder before launching the browser so that a
    # filesystem error cannot leave a browser process behind.
    os.makedirs(screenshot_folder, exist_ok=True)

    selenium_params = get_selenium_params()
    browser = webdriver.Chrome(service=selenium_params['service'], options=selenium_params['options'])

    has_matches = False
    try:
        browser.get(url)
        browser.set_window_size(1366, 728)
        browser.save_screenshot(screenshot_path)
        has_matches = True
    except Exception:
        # Best effort: a failed page load or screenshot is reported to the
        # caller through has_matches instead of propagating.
        has_matches = False
    finally:
        # Always shut the browser and its driver down; otherwise every call
        # leaks a Chrome and a chromedriver process.
        browser.quit()

    results_dic['yahoo'] = (has_matches, screenshot_path, screenshot_folder, url)
Я попробовал использовать следующие опции с chromedriver:
def get_selenium_params():
    """Build the arguments used to start Chrome through Selenium.

    Returns:
        A dict with two entries: ``'service'``, a ``Service`` pointing at
        the chromedriver executable under ``BASE_DIR``, and ``'options'``,
        a ``ChromeOptions`` object with the browser binary location and the
        command-line flags listed below.
    """
    chromedriver_file = BASE_DIR / 'driver/chromedriver-linux64/chromedriver'

    # Flags are applied in exactly this order.
    chrome_flags = (
        '--headless',
        '--start-maximized',
        '--remote-debugging-pipe',
        "enable-automation",
        "--no-sandbox",
        "--disable-extensions",
        "--dns-prefetch-disable",
        "--disable-gpu",
        '--disable-dev-shm-usage',
    )

    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = "/usr/bin/chromium-browser"
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    selenium_params = {}
    selenium_params['service'] = Service(executable_path=chromedriver_file)
    selenium_params['options'] = chrome_options
    return selenium_params
Ошибка:
Message: timeout: Timed out receiving message from renderer: 60.000
Stacktrace:
#0 0x594000d8b8a3 <unknown>
#1 0x594000a818c6 <unknown>
#2 0x594000a697a0 <unknown>
#3 0x594000a67516 <unknown>
#4 0x594000a67b9f <unknown>
#5 0x594000ab53de <unknown>
#6 0x594000ab1d3d <unknown>
#7 0x594000afaaed <unknown>
#8 0x594000aee343 <unknown>
#9 0x594000abf593 <unknown>
#10 0x594000abff5e <unknown>
#11 0x594000d4f88b <unknown>
#12 0x594000d537e5 <unknown>
#13 0x594000d3d5b1 <unknown>
#14 0x594000d54372 <unknown>
#15 0x594000d228bf <unknown>
#16 0x594000d7a768 <unknown>
#17 0x594000d7a93b <unknown>
#18 0x594000d8a9f4 <unknown>
#19 0x707d49897b5a <unknown>