Multithreaded scraping: finish the crawl in seconds. The script below collects supplier listings from guidechem.com by province, writes each listing page's company records (name, main product, phone, address) to its own CSV, and splits the page list across roughly 15 threads.
# -*- coding: utf-8 -*-
"""
Created on Tue May 17 16:26:31 2016
Scraped files smaller than 2 KB indicate a failed page.
@author: Administrator
"""
import requests
import bs4
import csv
import time
import random
import os
import threading

# combined output file
fileName = 'combinedFile.csv'
# second-level urls that failed to download
bad_urls = []

site_hubei = "http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
site_guangdong = "http://china.guidechem.com/suppliers/list_catid-21_area-广东"
site_shanghai = "http://china.guidechem.com/suppliers/list_catid-21_area-%E4%B8%8A%E6%B5%B7"
site_shanxi = "http://china.guidechem.com/suppliers/list_catid-21_area-陕西"
pages_hubei = 31
pages_guangdong = 21
pages_shanghai = 34
pages_shanxi = 15
start_page = 0


def Get_sites(site, pages):
    """Build the paginated listing urls for one region."""
    list_pages = []
    for page in range(1, pages + 1):
        thePage = site + "-p" + str(page) + ".html"
        list_pages.append(thePage)
    return list_pages


def Get_company_name(elems, i):
    """Company name; empty string if the element is missing."""
    elems_company_name = elems[i].select(".dblue")
    if len(elems_company_name) == 0:
        return ""
    return elems_company_name[0].text


def Get_main_product(elems, i):
    """Main product; empty string if the element is missing."""
    elems_main_product = elems[i].select("li")
    if len(elems_main_product) < 2:
        return ""
    return elems_main_product[1].text.strip()


def Get_phone_address(elems, i):
    """Phone and address; some listings omit one or both fields."""
    elems_contact = elems[i].select(".site_l")
    content_contact = elems_contact[0].text.strip()
    list_content_contact = content_contact.split()
    phone, address = "", ""  # defaults so both names are always bound
    if len(list_content_contact) == 2:
        phone, address = list_content_contact
    elif len(list_content_contact) == 1:
        content = list_content_contact[0]
        if "地址" in content:  # the lone field is the address
            address = content
        if "电话" in content:  # the lone field is the phone number
            phone = content
    return (phone, address)


def Get_page_information(url):
    """Scrape the 20 company entries on one listing page."""
    list_rows_information = []
    res = requests.get(url)
    time.sleep(2)  # throttle requests
    soup = bs4.BeautifulSoup(res.text, "lxml")
    elems = soup.select(".clist_list_content_r")
    for i in range(len(elems)):
        try:
            company_name = Get_company_name(elems, i)
            main_product = Get_main_product(elems, i)
            phone, address = Get_phone_address(elems, i)
            list_rows_information.append([company_name, main_product, phone, address])
        except Exception:
            print("error at:", i)
            continue
    return list_rows_information


def Write_table_to_csv(url):
    """Write one page's rows (a 2-D list [[a], [b], [c]]) to its own csv,
    named from the page url, e.g. ...-p12.html -> 'p12.csv'."""
    list_tableContent = Get_page_information(url)
    fileName = os.path.splitext(url)[0][-3:] + ".csv"
    with open(fileName, 'w', newline='') as f:
        csv.writer(f).writerows(list_tableContent)


def Write_allTables_to_csvs(list_pages):
    """Single-threaded fallback: scrape every page sequentially."""
    for i in range(start_page, len(list_pages)):
        try:
            Write_table_to_csv(list_pages[i])  # pass the url, not the index
            time.sleep(random.randint(30, 31))
        except Exception:
            print("error at:", i)
            continue


def Step(urls_list):
    """Chunk size that splits the url list across roughly 15 threads."""
    step = int(round(len(urls_list) / 15.0, 0))
    return max(step, 1)  # a step of 0 would break range()


def download_range(start, end):
    """Scrape one slice of list_pages; record failures in bad_urls."""
    for url in list_pages[start:end]:
        try:
            Write_table_to_csv(url)
        except Exception:
            bad_urls.append(url)
            continue


# main
list_pages = Get_sites(site_shanxi, pages_shanxi)
step = Step(list_pages)

# single-threaded alternative:
# Write_allTables_to_csvs(list_pages)

downloadThreads = []  # a list of all the Thread objects
for i in range(0, len(list_pages), step):  # ~15 threads, one per chunk
    downloadThread = threading.Thread(target=download_range, args=(i, i + step))
    downloadThreads.append(downloadThread)
    downloadThread.start()

# wait for all threads to end
for downloadThread in downloadThreads:
    downloadThread.join()
print('Done.')

'''
Scratch tests:

downloadThread = threading.Thread(target=download_range, args=(12, 14))
downloadThread.start()
downloadThread = threading.Thread(target=download_range, args=(14, 16))
downloadThread.start()

# parse one entry by hand
res = requests.get(list_pages[3])
soup = bs4.BeautifulSoup(res.text, "lxml")
elems = soup.select(".clist_list_content_r")
phone, address = Get_phone_address(elems, 2)
'''
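For comparison, the manual Step/chunk bookkeeping can be replaced with a thread pool from the standard library. This is a minimal sketch, not part of the original script: it reuses the Write_table_to_csv and Get_sites functions defined above, and the crawl_with_pool name is mine.

from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_with_pool(pages, workers=15):
    """Scrape every listing page on a pool of worker threads and
    return the urls that raised, so they can be retried."""
    failed = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # submit one task per page; map each future back to its url
        future_to_url = {pool.submit(Write_table_to_csv, url): url for url in pages}
        for future in as_completed(future_to_url):
            if future.exception() is not None:
                failed.append(future_to_url[future])
    return failed

# usage: crawl once, then retry whatever failed
# bad_urls = crawl_with_pool(Get_sites(site_shanxi, pages_shanxi))
# bad_urls = crawl_with_pool(bad_urls)

The pool keeps 15 workers busy no matter how many pages there are, so the Step helper and the off-by-one risk in the manual slicing disappear, and failed urls come back as a return value instead of a shared global.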