Python web-scraping video tutorial: from zero-experience beginner to Scrapy expert, an easy introduction
https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=56456460486
The site bans addresses that fetch too many pages, so you need to rotate IPs while scraping.
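One common way to handle this is to route each request through a pool of proxies and sleep a random interval between pages. Below is a minimal sketch; the PROXY_POOL addresses and the fetch_with_proxy helper are placeholders of my own, not endpoints the site provides:

import random
import time

import requests

# Placeholder proxy endpoints -- substitute your own working proxies.
PROXY_POOL = [
    "http://127.0.0.1:8001",
    "http://127.0.0.1:8002",
]

def fetch_with_proxy(url):
    # Fetch a page through a randomly chosen proxy, pausing afterwards
    # so requests do not arrive in a rapid burst.
    proxy = random.choice(PROXY_POOL)
    res = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=10)
    res.raise_for_status()
    time.sleep(random.uniform(5, 15))  # randomized delay to avoid tripping the rate limit
    return res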
# -*- coding: utf-8 -*-
"""
Created on Tue May 17 16:26:31 2016

@author: Administrator
"""
import requests, bs4, csv, time, random, os

# combined output file for all the collected records (not used below)
fileName = 'combinedFile.csv'
# listing directory of the second-level URLs (Hubei suppliers)
site_hubei = "http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
pages_hubei = 31


# build the URL of every listing page
def Get_sites(site, pages):
    list_pages = []
    for page in range(1, pages + 1):
        thePage = site + "-" + "p" + str(page) + ".html"
        list_pages.append(thePage)
    return list_pages


def Get_company_name(elems, i):
    elems_company_name = elems[i].select(".dblue")
    if len(elems_company_name) == 0:
        # element missing: leave the field empty
        return ""
    company_name = elems_company_name[0].text
    return company_name


def Get_main_product(elems, i):
    elems_main_product = elems[i].select("li")
    if len(elems_main_product) < 2:
        # element missing: leave the field empty
        return ""
    main_product = elems_main_product[1].text.strip()
    return main_product


def Get_phone_address(elems, i):
    elems_contact = elems[i].select(".site_l")
    content_contact = elems_contact[0].text.strip()
    list_content_contact = content_contact.split()
    # either field may be missing, so inspect the text to tell phone from address;
    # default both to empty so nothing is unbound when the split yields 0 or 3+ parts
    phone = ""
    address = ""
    if len(list_content_contact) == 2:
        phone = list_content_contact[0]
        address = list_content_contact[1]
    if len(list_content_contact) == 1:
        content = list_content_contact[0]
        if "地址" in content:   # "address"
            address = content
        if "电话" in content:   # "phone"
            phone = content
    return (phone, address)


# collect the 20 company records on one listing page into list_rows_information
def Get_page_information(elems):
    list_rows_information = []
    num = len(elems)
    for i in range(num):
        try:
            # company name
            company_name = Get_company_name(elems, i)
            # main products
            main_product = Get_main_product(elems, i)
            # contact details
            phone, address = Get_phone_address(elems, i)
            list_rows_information.append([company_name, main_product, phone, address])
        except Exception:
            print("error at:", i)
            continue
    return list_rows_information


# write one page's rows to a CSV file; list_tableContent is a nested list [[a], [b], [c]]
def Write_table_to_csv(fileName, list_tableContent):
    with open(fileName, 'w', newline='', encoding='utf-8') as file:
        writer1 = csv.writer(file)
        writer1.writerows(list_tableContent)


# scrape every listing page and write one CSV per page
def Write_allTables_to_csvs(list_pages):
    for i in range(len(list_pages)):
        try:
            res = requests.get(list_pages[i])
            soup = bs4.BeautifulSoup(res.text, "lxml")
            # one .clist_list_content_r block per company
            elems = soup.select(".clist_list_content_r")
            # collect the 20 company records on this page
            list_rows_information = Get_page_information(elems)
            filename = str(i + 1) + ".csv"
            Write_table_to_csv(filename, list_rows_information)
            time.sleep(random.randint(10, 15))  # pause between pages to avoid the ban
        except Exception:
            print("error at:", i)
            continue


# main routine
# build the 31 listing-page URLs
list_pages = Get_sites(site_hubei, pages_hubei)
# generate all the CSV files
Write_allTables_to_csvs(list_pages)

'''
Scratch test on a single page:
i = 3
res = requests.get(list_pages[i])
soup = bs4.BeautifulSoup(res.text, "lxml")
elems = soup.select(".clist_list_content_r")
# contact details
elems_contact = elems[2].select(".site_l")
content_contact = elems_contact[0].text.strip()
list_content_contact = content_contact.split()
# either field may be missing
if len(list_content_contact) == 2:
    phone = list_content_contact[0]
    address = list_content_contact[1]
if len(list_content_contact) == 1:
    content = list_content_contact[0]
    if "地址" in content:
        address = content
        phone = ""
    if "电话" in content:
        phone = content
        address = ""
'''
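Note that the script defines fileName = 'combinedFile.csv' but never writes to it. A minimal sketch of one way to merge the per-page files into that combined file afterwards; Combine_csvs is a hypothetical helper of my own, not part of the original script, and it assumes the numbered CSVs sit in the working directory:

import csv, os

def Combine_csvs(fileName, pages):
    # concatenate 1.csv ... pages.csv into one combined CSV
    with open(fileName, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        for i in range(1, pages + 1):
            page_file = str(i) + ".csv"
            if not os.path.exists(page_file):  # skip pages that failed to download
                continue
            with open(page_file, newline='', encoding='utf-8') as in_file:
                writer.writerows(csv.reader(in_file))

Combine_csvs(fileName, pages_hubei)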