import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Listing page that index_page() loads before navigating to the requested page.
REPORT_URL = "http://data.eastmoney.com/bbsj/202006/yjkb.html"

# Header names applied to the scraped table, in on-page column order.
# NOTE(review): "同比增长" and "季度环比增长" each appear twice, so the resulting
# DataFrame has duplicate column labels. They are kept as-is to preserve the
# layout of the exported sheet -- confirm whether they should be disambiguated.
COLUMNS = [
    "序号", "股票代码", "股票简称", "相关", "每股收益",
    "营业收入(元)", "去年同期(元)", "同比增长", "季度环比增长",
    "净利润", "去年同期", "同比增长", "季度环比增长",
    "每股净资产", "净资产收益率", "所处行业", "公告日期",
]

# Workbook written by main().
OUTPUT_FILE = "2020年6月上市公司财报数据.xlsx"

# NOTE(review): `executable_path` is the Selenium 3 style of pointing at the
# driver binary; confirm the installed Selenium release still accepts it
# (otherwise a plain `webdriver.Chrome()` with chromedriver on PATH is needed).
browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
browser.maximize_window()
wait = WebDriverWait(browser, 10)  # each explicit wait gives up after 10 seconds


def _jump_to_page(page):
    """Type ``page`` into the pager's jump box and submit it."""
    page_box = wait.until(
        EC.presence_of_element_located((By.ID, "PageContgopage"))
    )
    page_box.clear()
    page_box.send_keys(str(page))
    go_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#PageCont >a.btn_link"))
    )
    go_button.click()
    # Fixed pause after submitting; presumably gives the table time to start
    # reloading before the caller's explicit waits run.
    time.sleep(2)


def index_page(page):
    """Scrape one page of the report table into a DataFrame.

    :param page: page number to scrape; values greater than 1 are reached
        through the pager's jump box after loading ``REPORT_URL``.
    :return: a DataFrame with the ``COLUMNS`` headers plus a ``"url"`` column
        holding the ``href`` of every ``a.red`` link in the table, or ``None``
        if any step failed (the failure is printed rather than raised).
    """
    try:
        browser.get(url=REPORT_URL)
        print("正在爬去第%s页" % page)
        if page > 1:
            _jump_to_page(page)

        # Wait for the table and for the pager to mark `page` as current.
        wait.until(EC.presence_of_element_located((By.ID, "dt_1")))
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#PageCont>span.at"), str(page)
            )
        )

        table = browser.find_element(By.CSS_SELECTOR, "#dt_1")
        cells = [td.text for td in table.find_elements(By.TAG_NAME, "td")]

        # The number of <td> cells in a first-child row gives the row width
        # used to fold the flat cell list back into rows.
        width = len(table.find_elements(By.CSS_SELECTOR, "tr:first-child td"))
        rows = [cells[i:i + width] for i in range(0, len(cells), width)]

        links = [
            anchor.get_attribute("href")
            for anchor in table.find_elements(By.CSS_SELECTOR, "#dt_1 a.red")
        ]

        df_table = pd.DataFrame(rows, columns=COLUMNS)
        df_table["url"] = links
        return df_table
    except Exception as exc:
        # Best-effort scraping: a failed page must not abort the whole run,
        # but report it so missing pages are not silently lost.
        print("Failed to scrape page %s: %r" % (page, exc))
        return None


def main():
    """Scrape pages 1-4 and write the combined table to ``OUTPUT_FILE``."""
    frames = []
    try:
        for page in range(1, 5):
            df_table = index_page(page)
            if df_table is not None:
                frames.append(df_table)
    finally:
        # Always shut the browser down so Chrome/chromedriver do not linger.
        browser.quit()

    # Concatenate once; fall back to an empty frame when every page failed.
    all_data = pd.concat(frames) if frames else pd.DataFrame()
    all_data.to_excel(OUTPUT_FILE)


main()
# 爬取东方财富财报数据 (Scrape financial-report data from East Money, data.eastmoney.com)