前言:大大小小的电商网站爬了不少。结论就是分两种类型:
第一:requests 直接获取
第二:网页动态加载,requests获取失败
直接分享代码吧
1.先导入需要的库和chromedriver的地址(爬动态加载的网页需要,若是requests可直接获取的网站可忽略)
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Path to the local chromedriver binary. Only needed when crawling dynamically
# loaded pages; sites that plain `requests` can fetch do not use it.
CHROME_DRIVER_PATH = '/Users/xxxx/PycharmProjects/爬虫/chromedriver'
2.我先给出主函数,里面方法我会在下面贴出来
我爬的是电商网站,自然是爬取列表页的商品信息(商品描述,商品链接,商品售价,商品原价)
那么下面是爬静态网页的核心函数
# Handles statically rendered list pages.
def dealSoup(now_soup, cate_name, cate_url, now_page_num):
    """Parse one static list page, save its products and report the page count.

    Relies on the module-level names ``extractNum``, ``objListToExcel``,
    ``heads_0`` and ``save_dir``.

    Args:
        now_soup: Parsed page; any object offering a BeautifulSoup-style
            ``select(css_selector)`` method.
        cate_name: Caller-chosen label for this list page; stored with every
            product and used in the output file name.
        cate_url: URL of the page being processed; stored with every product.
        now_page_num: Page number, used only in the output file name.

    Returns:
        A ``(found, total_page_num)`` tuple. ``found`` is False when the page
        contains no product tiles (nothing is written in that case).
        ``total_page_num`` is the page count read from the pager, or 1 when
        the page has no pager.
    """
    # Breadcrumb levels joined with "/" (same format the dynamic variant
    # produces).
    breadcrumb_tags = now_soup.select('.category-breadcrumb li ')
    cate_all_text = '/'.join(tag.text.strip() for tag in breadcrumb_tags)

    # Total page count.
    total_page_num = 1
    pager_tags = now_soup.select('.site-pager li')
    if pager_tags:
        # A lone pager entry is read directly; otherwise the second-to-last
        # entry is used (presumably the last one is the "next" link -- verify
        # against the site markup).
        pager_tag = pager_tags[0] if len(pager_tags) == 1 else pager_tags[-2]
        total_page_num = int(extractNum(pager_tag.text))
        print(total_page_num)

    # Walk every product tile.
    tag_list = now_soup.select('.category-list div.item')
    if not tag_list:
        return False, total_page_num

    print(len(tag_list))
    item_list = []
    for tag in tag_list:
        name_links = tag.select('.name > a')
        if not name_links:
            # Tile without a name link: nothing useful to record.
            continue
        desc_tag = name_links[0]
        price_tag_list = tag.select('.my-shop-price')

        item = {
            'cate_name_all': cate_all_text,
            'cate_name': cate_name,
            'cate_url': cate_url,
            'product_now_price': 'null',
            'product_old_price': 'null',
        }
        item['product_desc'] = desc_tag.text.strip()
        item['product_link'] = desc_tag.attrs.get('href', 'null')
        if price_tag_list:
            now_price = price_tag_list[0].attrs.get('data-oprice', 'null')
            item['product_now_price'] = now_price
            # With a single price tag the old price equals the current one.
            item['product_old_price'] = now_price
            if len(price_tag_list) > 1:
                item['product_old_price'] = price_tag_list[1].attrs.get(
                    'data-oprice', 'null')
        print(item)
        item_list.append(item)

    objListToExcel(item_list, heads_0,
                   f"{save_dir}/{cate_name}_{now_page_num}.xlsx")
    return True, total_page_num


if __name__ == "__main__":
    # List pages to crawl.
    # cate_url: URL of the list page
    # cate_name: your own label for that list page
    ALL_CATE_LIST = [
        {'cate_url': 'https://www.adorawe.net/category/denim-pants-c_808.html',
         'cate_name': 'Pants1'},
        {'cate_url': 'https://www.adorawe.net/category/casual-pants-c_809.html',
         'cate_name': 'Pants'},
    ]

    # Directory that receives one .xlsx file per crawled page.
    save_dir = '/Users/xxxx/Desktop/adorawe'
    os.makedirs(save_dir, exist_ok=True)

    # Crawl every list page.
    for cate_obj in ALL_CATE_LIST:
        # Fetch the first page as a BeautifulSoup document.
        # NOTE: get_static_html is not defined in this snippet; it must be
        # provided elsewhere.
        soup = get_static_html(cate_obj['cate_url'])
        # Save the first page's products and learn the total page count.
        go_status, page_num = dealSoup(
            soup, cate_obj['cate_name'], cate_obj['cate_url'], 1)

        # Follow-up pages use "<url without .html>-page-<n>.html".
        body_url = cate_obj['cate_url'].replace('.html', '')
        for i in range(1, page_num):
            if not go_status:
                # The previous page had no products; stop paging.
                break
            tmp_url = f"{body_url}-page-{i+1}.html"
            tmp_soup = get_static_html(tmp_url)
            # The page count was already taken from page 1; ignore it here.
            go_status, _ = dealSoup(
                tmp_soup, cate_obj['cate_name'], tmp_url, i + 1)

    # Every page was saved to its own file, so merge them into one workbook.
    connectToOne(save_dir, '/Users/xxx/Desktop', 'adorawe.xlsx')
下面是爬动态网页的代码
# Handles dynamically loaded list pages.
def dealSoup(driver, cate_name, cate_url, page_num):
    """Parse the page currently shown by ``driver`` and save its products.

    Relies on the module-level names ``BeautifulSoup``, ``objListToExcel``,
    ``heads_0`` and ``save_dir``.

    Args:
        driver: Selenium WebDriver whose ``page_source`` is parsed.
        cate_name: Caller-chosen label for this list page; stored with every
            product and used in the output file name.
        cate_url: URL of the list page; stored with every product.
        page_num: Page index, used only in the output file name.

    Returns:
        True when the page contains product tiles, False otherwise (nothing
        is written in that case).
    """
    now_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Breadcrumb levels joined with "/".
    breadcrumb_tags = now_soup.select('ul.breadcrumb > li')
    cate_all_text = '/'.join(tag.text.strip() for tag in breadcrumb_tags)

    # Walk every product tile.
    tag_list = now_soup.select('div.product-list-container > .product-item')
    if not tag_list:
        return False

    print(len(tag_list))
    item_list = []
    for tag in tag_list:
        name_tags = tag.select('.product-item-name')
        if not name_tags:
            # Tile without a name element: nothing useful to record.
            continue
        desc_tag = name_tags[0]
        link_tags = desc_tag.select('a')
        final_price_tag_list = tag.select('.product-item-final-price-js')
        del_price_tag_list = tag.select('.product-item-del-price-js')

        item = {
            'cate_name_all': cate_all_text,
            'cate_name': cate_name,
            'cate_url': cate_url,
            'product_now_price': 'null',
            'product_old_price': 'null',
        }
        item['product_desc'] = desc_tag.text.strip()
        # The last anchor inside the name element carries the product link.
        item['product_link'] = (
            link_tags[-1].attrs.get('href', 'null') if link_tags else 'null')
        if final_price_tag_list:
            final_price = final_price_tag_list[0].text.strip()
            item['product_now_price'] = final_price
            # Without a struck-through price the old price equals the
            # current one.
            item['product_old_price'] = final_price
        if del_price_tag_list:
            item['product_old_price'] = del_price_tag_list[0].text.strip()
        print(item)
        item_list.append(item)

    objListToExcel(item_list, heads_0,
                   f"{save_dir}/{cate_name}_{page_num}.xlsx")
    return True


if __name__ == "__main__":
    # Selenium 4 API: the driver path is passed through a Service object and
    # elements are located with find_elements(By...). The older
    # executable_path= / chrome_options= keywords and the
    # find_elements_by_* helpers were removed from Selenium 4.
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By

    # List pages to crawl.
    # cate_url: URL of the list page
    # cate_name: your own label for that list page
    # total_page: total number of pages of that list
    ALL_CATE_LIST = [
        {'cate_url': 'https://sea.newchic.com/pajamas-and-robes-c-4185/?country=188&SEA=0',
         'cate_name': 'Loungewear', 'total_page': 9},
        {'cate_url': 'https://sea.newchic.com/womens-shoes-c-3592/?country=188&SEA=0',
         'cate_name': 'Shoes', 'total_page': 62},
    ]

    # Directory that receives one .xlsx file per crawled page.
    save_dir = '/Users/xxx/Desktop/newchic'
    os.makedirs(save_dir, exist_ok=True)

    # Open the first list page in a real browser.
    site_url_0 = ALL_CATE_LIST[0]['cate_url']
    print('开始加载', site_url_0, '动态页面')
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(service=Service(CHROME_DRIVER_PATH),
                              options=chrome_options)
    try:
        driver.set_page_load_timeout(100)
        driver.set_window_size(1420, 780)
        driver.get(site_url_0)
        # Products are lazy-loaded, so scroll the page to make all of them
        # render. The second argument is the scroll distance; tune it per
        # site until every product gets loaded.
        fullpage_screenshot(driver, 10000)
        time.sleep(5)
        # Process this page and store it locally.
        # NOTE: the first loop iteration below processes the same page again
        # and writes the same file name.
        dealSoup(driver, ALL_CATE_LIST[0]['cate_name'],
                 ALL_CATE_LIST[0]['cate_url'], 0)

        # Crawl every list page.
        for cate_obj in ALL_CATE_LIST:
            driver.get(cate_obj['cate_url'])
            fullpage_screenshot(driver, 10000)
            dealSoup(driver, cate_obj['cate_name'], cate_obj['cate_url'], 0)

            # Keep clicking "next" until the configured page count is
            # reached, the button disappears, or a page has no products.
            for i in range(1, cate_obj['total_page']):
                next_page_tag_list = driver.find_elements(
                    By.CSS_SELECTOR, '.page-item-next')
                if not next_page_tag_list:
                    break
                next_page_tag_list[0].click()
                time.sleep(3)
                fullpage_screenshot(driver, 6000)
                if not dealSoup(driver, cate_obj['cate_name'],
                                cate_obj['cate_url'], i):
                    break
            # Pause before the next category / before shutting down.
            time.sleep(10)
    finally:
        # Always release the browser, even when the crawl fails midway.
        driver.quit()

    # Every page was saved to its own file, so merge them into one workbook.
    connectToOne(save_dir, '/Users/xxx/Desktop', 'newchic.xlsx')
最后:
用到的其他方法,我就一次性粘贴了(注意:静态网页主函数里调用的 get_static_html 没有在本文中给出,需要自行实现,例如用 requests 获取页面后交给 BeautifulSoup 解析):
# Simulated scrolling.
def fullpage_screenshot(driver, total_height):
    """Scroll the page viewport by viewport so lazy-loaded content renders.

    Despite its name this function takes no screenshot; it only scrolls.

    Args:
        driver: Selenium WebDriver (anything with ``execute_script``).
        total_height: Vertical distance in pixels to cover.

    Returns:
        Always True.
    """
    total_width = driver.execute_script("return document.body.offsetWidth")
    viewport_width = driver.execute_script("return document.body.clientWidth")
    viewport_height = driver.execute_script("return window.innerHeight")

    if not viewport_width or not viewport_height:
        # A zero-sized viewport would never advance the loops below, so jump
        # to the target position once instead of looping forever.
        driver.execute_script(
            "window.scrollTo({0}, {1})".format(0, total_height))
        return True

    # Top-left corner of every viewport-sized tile, row by row.
    positions = []
    top = 0
    while top < total_height:
        left = 0
        while left < total_width:
            positions.append((left, top))
            left += viewport_width
        top += viewport_height

    # The page already sits at the first tile, so start with the second one.
    for left, top in positions[1:]:
        driver.execute_script("window.scrollTo({0}, {1})".format(left, top))
        time.sleep(0.2)  # give lazy-loaded content time to appear
    return True


# Column order of the exported spreadsheets.
heads_0 = ['cate_name_all', 'cate_name', 'cate_url', 'product_link',
           'product_desc', 'product_now_price', 'product_old_price']


def objListToExcel(objlist, column_arr, out_path):
    """Write a list of dicts to an Excel file.

    Args:
        objlist: List of dicts; each must contain every key in ``column_arr``.
        column_arr: Column names, in output order.
        out_path: Destination ``.xlsx`` path.

    Returns:
        0 when ``objlist`` is empty (no file is written), otherwise None.

    Raises:
        KeyError: If a dict lacks one of the requested columns.
    """
    if len(objlist) == 0:
        return 0
    columns = {key: [obj[key] for obj in objlist] for key in column_arr}
    df_data = pd.DataFrame(columns, columns=column_arr)
    df_data.to_excel(out_path, index=False)


# Price pattern: digits, a literal dot, exactly two decimals.
_PRICE_PATTERN = re.compile(r'[0-9]+\.[0-9]{2}')
# First run of digits.
_NUMBER_PATTERN = re.compile(r'[0-9]+')


def extractPriceNum(price_str):
    """Return the first ``<digits>.<two digits>`` price in ``price_str``.

    Returns the string ``'null'`` when no such price is present.
    """
    match = _PRICE_PATTERN.search(price_str)
    return match.group(0) if match else 'null'


def extractNum(test_str):
    """Return the first run of digits in ``test_str`` as an int (1 if none)."""
    match = _NUMBER_PATTERN.search(test_str)
    return int(match.group(0)) if match else 1


def connectToOne(dir, to_dir, out_file_name):
    """Merge every ``.xlsx`` file in ``dir`` into one workbook.

    Args:
        dir: Directory holding the per-page ``.xlsx`` files.
        to_dir: Directory that receives the merged workbook.
        out_file_name: File name of the merged workbook.

    Returns:
        None. When ``dir`` holds no usable ``.xlsx`` file nothing is written.
    """
    excel_list = []
    # Sorted so the merge order does not depend on the file system.
    for file_name in sorted(os.listdir(dir)):
        if not file_name.endswith('.xlsx'):
            continue
        # Skip lock/temporary files created by office applications.
        if '.~' in file_name or file_name.startswith('~$'):
            continue
        print("file:", file_name)
        excel_list.append(pd.read_excel(
            os.path.join(dir, file_name),
            dtype={'cate_url': str, 'product_link': str},
        ))

    if not excel_list:
        # pd.concat raises on an empty list; there is nothing to merge.
        print('没有找到可合并的 xlsx 文件')
        return

    print('开始合并')
    total_excel = pd.concat(excel_list)
    print('生成文件')
    out_path = os.path.join(to_dir, out_file_name)
    # `options=` was removed from pd.ExcelWriter; engine-specific settings go
    # through `engine_kwargs`. strings_to_urls=False keeps URLs as plain text.
    with pd.ExcelWriter(
            out_path, engine='xlsxwriter',
            engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
        print(out_path, writer)
        total_excel.to_excel(writer, index=False)
————————————————
版权声明:本文为CSDN博主「blues_phone」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/huangmengfeng/article/details/116146346