Запросить помощь по вопросам веб-скраппинга
Я пытался найти данные на двух веб-сайтах, но столкнулся с проблемами. Я буду очень рад, если кто-нибудь сможет помочь в решении проблемы
1.https://online.capitalcube.com/ На сайте необходимо войти в систему. Я придумал следующий код после просмотра обучающих материалов на youtube в течение последних 2 дней.
from bs4 import BeautifulSoup
import pandas as pd
import requests
URL = 'https://online.capitalcube.com/'
LOGIN_ROUTE = '/login'
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:93.0) Gecko/20100101 Firefox/93.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'TE': 'trailers',
}
s = requests.session()
login_payload = {
'email': '<intentionally removed it>',
'password': '<intentionally removed it>'
}
login_req = s.post(URL + LOGIN_ROUTE, headers = headers, data = login_payload)
print(login_req.status_code)
Ошибка, которую я получаю, выглядит следующим образом
- The other website I am trying is stockedge.com I have come up with the following code
import requests
from bs4 import BeautifulSoup
import pandas as pd
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:93.0) Gecko/20100101 Firefox/93.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Connection': 'keep-alive',
}
ticker = 'hdfc-bank/5051?'
urls = {}
urls['balancesheet consolidated'] = f"https://web.stockedge.com/share/{ticker}section=balance-sheet"
urls['balancesheet standalone'] = f"https://web.stockedge.com/share/{ticker}section=balance-sheet&statement-type=standalone"
urls['profitloss consolidated'] = f"https://web.stockedge.com/share/{ticker}section=profit-loss&statement-type=consolidated"
urls['profitloss standalone'] = f"https://web.stockedge.com/share/{ticker}section=profit-loss&statement-type=standalone"
urls['cashflow consolidated'] = f"https://web.stockedge.com/share/{ticker}section=cash-flow"
urls['cashflow standalone'] = f"https://web.stockedge.com/share/{ticker}section=cash-flow&statement-type=standalone"
urls['quarterlyresults consolidated'] = f"https://web.stockedge.com/share/{ticker}section=results"
urls['quarterlyresults standalone'] = f"https://web.stockedge.com/share/{ticker}section=results&active-statement-type=Standalone"
urls['shareholding pattern'] = f"https://web.stockedge.com/share/{ticker}section=pattern"
urls['return ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=roe"
urls['efficiency ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=roe&ratio-category=efficiencyratios"
urls['growth ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=roe&ratio-category=growthratios"
urls['solvency ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=net_sales_growth&ratio-category=solvencyratios"
urls['cashflow ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=net_sales_growth&ratio-category=cashflowratios"
urls['valuation ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=net_sales_growth&ratio-category=valuationratios"
xlwriter = pd.ExcelWriter(f'financial statements ({ticker}).xlsx', engine='xlsxwriter')
for key in urls.keys():
response = requests.get(urls[key], headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
df = pd.read_html(str(soup), attrs={'class': 'background md list-md hydrated'})[0]
df.to_excel(xlwriter, sheet_name=key, index=False)
xlwriter.save()
Ошибка, которую я получаю
runfile('/Users/rafatsiddiqui/Downloads/scientificProject/Company Financial Webscrape.py', wdir='/Users/rafatsiddiqui/Downloads/scientificProject') Traceback (последний последний вызов): Файл "", строка 1, in File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_bundle/pydev_umd.py", line 198, in runfile pydev_imports.execfile(filename, global_vars, local_vars) # выполнить сценарий Файл "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_imps/_pydev_execfile.py", строка 18, в execfile exec(compile(contents+"\n", file, 'exec'), glob, loc) Файл "/Users/rafatsiddiqui/Downloads/scientificProject/Company Financial Webscrape.py", строка 36, in xlwriter = pd.ExcelWriter(f'финансовые отчеты ({ticker}).xlsx', engine='xlsxwriter') Файл "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/pandas/io/excel/_xlsxwriter.py", строка 191, in init. super().init( Файл "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/pandas/io/excel/_base.py", строка 925, in init self.handles = get_handle( Файл "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/pandas/io/common.py", строка 711, in get_handle handle = open(handle, ioargs.mode) FileNotFoundError: [Errno 2] Нет такого файла или каталога: 'financial statements (hdfc-bank/5051?).xlsx'