# -*- coding: utf-8 -*-
"""
Created on Sun May 15 20:41:32 2016

@author: daxiong
"""
import requests, bs4, csv, time, random

# combined CSV that will hold all the scraped rows
fileName = 'combinedFile.csv'
# listing page (second-level URL directory) for Hubei suppliers
site_hubei = "http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
pages_hubei = 31


def Get_sites(site, pages):
    # build one URL per result page: <site>-p<N>.html
    list_pages = []
    for page in range(1, pages + 1):
        thePage = site + "-" + "p" + str(page) + ".html"
        list_pages.append(thePage)
    return list_pages


# extract the 20 company records on one page into list_rows_information
def Get_page_information(elems):
    list_rows_information = []
    for i in range(len(elems)):
        try:
            # company name
            elems_company_name = elems[i].select(".dblue")
            company_name = elems_company_name[0].text
            # main products
            elems_main_product = elems[i].select("li")
            main_product = elems_main_product[1].text.strip()
            # contact info: after splitting on whitespace, the first token
            # is the phone number and the second is the address
            elems_contact = elems[i].select(".site_l")
            content_contact = elems_contact[0].text
            list_content_contact = content_contact.split()
            phone = list_content_contact[0]
            address = list_content_contact[1]
            list_rows_information.append([company_name, main_product, phone, address])
        except Exception:
            print("error at:", i)
            continue
    return list_rows_information


# write one page of results to a CSV file;
# list_tableContent is a 2-D list such as [[a], [b], [c]]
def Write_table_to_csv(fileName, list_tableContent):
    file = open(fileName, 'w', newline='', encoding='utf-8')
    writer1 = csv.writer(file)
    writer1.writerows(list_tableContent)
    file.close()


# scrape every page and write one CSV per page
def Write_allTables_to_csvs(list_pages):
    for i in range(len(list_pages)):
        try:
            res = requests.get(list_pages[i])
            soup = bs4.BeautifulSoup(res.text, "lxml")
            # one .clist_list_content_r block per company
            elems = soup.select(".clist_list_content_r")
            list_rows_information = Get_page_information(elems)
            filename = str(i + 1) + ".csv"
            Write_table_to_csv(filename, list_rows_information)
            # pause between requests to avoid hammering the server
            time.sleep(random.randint(10, 15))
        except Exception:
            print("error at:", i)
            continue


# main routine
# build the 31 page URLs
list_pages = Get_sites(site_hubei, pages_hubei)

'''
# generate all the CSV files
Write_allTables_to_csvs(list_pages)
'''

# manual check of a single page
i = 3
res = requests.get(list_pages[i])
soup = bs4.BeautifulSoup(res.text, "lxml")
elems = soup.select(".clist_list_content_r")
# contact info of the third company on the page
elems_contact = elems[2].select(".site_l")
content_contact = elems_contact[0].text
list_content_contact = content_contact.split()
# sometimes a field is missing; filtering the text with a regular
# expression would be more robust
phone = list_content_contact[0]
address = list_content_contact[1]
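The last comment points at the real weakness: when the phone or the address is missing from the contact block, positional splitting either raises an IndexError or silently pairs the wrong values. A minimal regex sketch of that idea follows. The field labels 电话 (phone) and 地址 (address) are an assumption about the .site_l text, not confirmed against the live page, so check the raw text before relying on them.

import re

# Hypothetical helper sketching the regex idea from the comment above;
# the 电话/地址 labels are assumed, not taken from the actual markup.
def parse_contact(text):
    phone_match = re.search(r'电话[::]\s*([\d\-+/ ]{5,})', text)
    address_match = re.search(r'地址[::]\s*(\S.*)', text)
    # missing fields come back as '' instead of shifting positions
    phone = phone_match.group(1).strip() if phone_match else ''
    address = address_match.group(1).strip() if address_match else ''
    return phone, address

phone, address = parse_contact(content_contact)

With this approach a listing that omits its phone number still yields a usable address, whereas index-based splitting would have assigned the address to the phone column.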
Errors occur at i=2 and i=9.
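Those failures are consistent with entries whose contact block has fewer tokens than expected, so the indexing raises and the whole row is dropped. One defensive variant, sketched with the same selectors used above, keeps partial rows instead of discarding them:

# Sketch: parse one company entry without raising, padding missing
# fields with '' so partial rows still land in the CSV.
def safe_row(elem):
    names = elem.select(".dblue")
    company_name = names[0].text if names else ''
    items = elem.select("li")
    main_product = items[1].text.strip() if len(items) > 1 else ''
    contacts = elem.select(".site_l")
    parts = contacts[0].text.split() if contacts else []
    phone = parts[0] if len(parts) > 0 else ''
    address = parts[1] if len(parts) > 1 else ''
    return [company_name, main_product, phone, address]

Swapping this in for the body of Get_page_information's try block would turn the i=2 and i=9 failures into rows with blank cells, which are easier to spot and fix in the combined CSV than silently missing records.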