• 化工最新采集3——多线程


    sklearn实战-乳腺癌细胞数据挖掘(博主亲自录制视频)

    多线程采集,1秒搞定

    # -*- coding: utf-8 -*-
    """
    Created on Tue May 17 16:26:31 2016
    采集下来excel文件小于2kb的有问题
    
    @author: Administrator
    """
    
      
    import requests,bs4,csv,time,random,os,threading
      
    # Name for a merged output CSV (declared but not actually used below).
    fileName='combinedFile.csv'
    # URLs whose download failed; appended to by download_range for later retry.
    bad_urls=[]
    # Supplier-listing index URLs per province (path segment is either
    # percent-encoded or raw UTF-8 Chinese — the site accepts both).
    site_hubei="http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
    site_guangdong="http://china.guidechem.com/suppliers/list_catid-21_area-广东"
    site_shanghai="http://china.guidechem.com/suppliers/list_catid-21_area-%E4%B8%8A%E6%B5%B7"
    site_shanxi="http://china.guidechem.com/suppliers/list_catid-21_area-陕西"
    # Page counts per province — presumably read off the site at写作 time; TODO confirm.
    pages_hubei=31
    pages_guangdong=21
    pages_shanghai=34
    pages_shanxi=15
    # First page index used by the sequential (single-threaded) mode.
    start_page=0
     
    def Get_sites(site,pages):
        """Return the paginated listing URLs for *site*, pages 1..pages.

        Each URL has the form '<site>-p<N>.html'.
        """
        return [site + "-p{}.html".format(n) for n in range(1, pages + 1)]
    
    
    def Get_company_name(elems,i):
        """Return the company name text from the i-th result element.

        Falls back to "" when no '.dblue' element is present.
        """
        matches = elems[i].select(".dblue")
        return matches[0].text if matches else ""
    
    def Get_main_product(elems,i):
        """Return the main-product text from the i-th result element.

        The product description sits in the second <li> of the element;
        returns "" when the expected <li> items are missing.
        """
        items = elems[i].select("li")
        # bug fix: original only checked len == 0 but indexes [1], so a single
        # <li> raised IndexError (silently swallowed by the caller).
        if len(items) < 2:
            return ""
        # NOTE(review): the published source had a literal line break inside the
        # strip() argument; "\r\n" restores the evident intent (trim line breaks).
        return items[1].text.strip("\r\n")
        
    def Get_phone_address(elems,i):
        """Return (phone, address) extracted from the i-th result's '.site_l' block.

        Either field is "" when that piece of information is missing.  Raises
        IndexError when no '.site_l' element exists at all (the caller's
        per-row except swallows that and skips the row).
        """
        elems_contact = elems[i].select(".site_l")
        content = elems_contact[0].text
        # NOTE(review): the published source's strip()/split() arguments held
        # literal line breaks; "\r\n\t " / "\r\n\r\n" restore the evident
        # intent — trim surrounding whitespace, then split the phone line from
        # the address line on the blank line between them.  TODO confirm the
        # exact separator against live page markup.
        cleaned = content.strip(" \t\r\n")
        parts = cleaned.split("\r\n\r\n")
        # bug fix: originally phone/address were only assigned inside the
        # branches, so an unexpected parts length raised NameError at return.
        phone = ""
        address = ""
        if len(parts) == 2:
            phone, address = parts
        elif len(parts) == 1:
            single = parts[0]
            if "地址" in single:    # "address" marker
                address = single
            if "电话" in single:    # "phone" marker wins when both appear
                phone = single
                address = ""
        return (phone, address)
     
    # Fetch one listing page and collect its (up to 20) company rows.
    def Get_page_information(url):
        """Download *url* and return a list of company rows.

        Each row is [company_name, main_product, phone, address].  A row that
        fails to parse is skipped (its index printed) rather than aborting
        the whole page.
        """
        list_rows_information = []
        res = requests.get(url)
        time.sleep(2)   # crude politeness delay toward the target site
        soup = bs4.BeautifulSoup(res.text, "lxml")
        time.sleep(2)
        # One '.clist_list_content_r' element per company entry on the page.
        elems = soup.select(".clist_list_content_r")
        for i in range(len(elems)):
            try:
                company_name = Get_company_name(elems, i)
                main_product = Get_main_product(elems, i)
                phone, address = Get_phone_address(elems, i)
                list_rows_information.append(
                    [company_name, main_product, phone, address])
            except Exception:
                # bug fix: bare 'except:' also swallowed KeyboardInterrupt /
                # SystemExit; Exception keeps the skip-bad-row behaviour only.
                print("error at:", i)
                continue
        return list_rows_information
     
     
      
    # Write one page's scraped rows to its own CSV file.
    def Write_table_to_csv(url):
        """Scrape the listing page *url* and write its rows to '<pNN>.csv'.

        The file name is the last three characters of the URL path before the
        extension, e.g. '...area-xx-p12.html' -> 'p12.csv'.
        """
        list_tableContent = Get_page_information(url)
        fileName = os.path.splitext(url)[0][-3:] + ".csv"
        # bug fix: use a context manager so the file is closed even if the
        # write raises; utf-8 avoids UnicodeEncodeError on Chinese text under
        # a non-UTF-8 platform default.  newline='' is required by csv.
        with open(fileName, 'w', newline='', encoding='utf-8') as file:
            csv.writer(file).writerows(list_tableContent)
          
    # Single-threaded fallback: scrape every page sequentially.
    def Write_allTables_to_csvs(list_pages):
        """Scrape each URL in *list_pages* (from start_page on) to its own CSV.

        Sleeps 30-31 s between pages to stay polite; a failed page is logged
        and skipped.
        """
        for i in range(start_page, len(list_pages)):
            try:
                # bug fix: the original ignored list_pages entirely, looping
                # over range(start_page, pages_shanghai) and passing the bare
                # integer i to Write_table_to_csv, which expects a URL.
                Write_table_to_csv(list_pages[i])
                time.sleep(random.randint(30, 31))
            except Exception:
                print("error at:", i)
                continue
            
    # Chunk size for splitting the URL list across ~15 worker threads.
    def Step(urls_list):
        """Return len(urls_list)/15 rounded to the nearest int, minimum 1.

        Bug fix: for fewer than ~8 URLs the original returned 0, which made
        the caller's range(0, len, step) raise ValueError: step cannot be 0.
        """
        return max(1, int(round(len(urls_list) / 15.0)))
            
    # Thread worker: scrape the slice list_pages[start:end].
    def download_range(start,end):
        """Scrape each URL in the module-level list_pages[start:end].

        A URL that fails for any reason is recorded in the module-level
        bad_urls list (for a later retry) and the worker moves on.
        """
        for url in list_pages[start:end]:
            try:
                Write_table_to_csv(url)
            except Exception:
                # bug fix: bare 'except:' would also trap SystemExit; keep the
                # record-and-continue behaviour for ordinary failures only.
                bad_urls.append(url)
                continue
            
    # Main script: build the page list for Shaanxi, then fan the downloads out
    # across worker threads, each handling a contiguous slice of `step` pages.
    list_pages=Get_sites(site_shanxi,pages_shanxi)
    step=Step(list_pages)
    # Alternative: single-threaded collection of every page (much slower).
    #Write_allTables_to_csvs(list_pages)
    
    
    downloadThreads = [] # a list of all the Thread objects
    for i in range(0, len(list_pages), step): # one thread per slice of `step` pages
        downloadThread = threading.Thread(target=download_range, args=(i, i +step))
        downloadThreads.append(downloadThread)
        downloadThread.start()
     
    # Wait for all threads to end.
    for downloadThread in downloadThreads:
        downloadThread.join()
    print('Done.')
    
    
    # NOTE(review): dead code — an unassigned triple-quoted string used as a
    # block comment to keep interactive scratch/test snippets; never executed.
    # Kept byte-identical (it is a runtime string literal, albeit a no-op).
    '''
    测试
    
    #downloadThread = threading.Thread(target=download_range, args=(10, 12))
    #downloadThread.start()
    
    downloadThread = threading.Thread(target=download_range, args=(12, 14))
    downloadThread.start()
    
    
    downloadThread = threading.Thread(target=download_range, args=(14, 16))
    downloadThread.start()
    i=3
    res=requests.get(list_pages[i])
    soup=bs4.BeautifulSoup(res.text,"lxml")
    elems=soup.select(".clist_list_content_r")
    #联系方式
    elems_contact=elems[2].select(".site_l")
    content_contact=elems_contact[0].text
    content_contact1=content_contact.strip("
    
    	
    ")
    content_contact2=content_contact1.strip("
    ")
    list_content_contact=content_contact2.split("
    
    ")
    
    #有时候信息会缺失,用正则表达式筛选text内容
    if len(list_content_contact)==2:
        phone=list_content_contact[0]
        address=list_content_contact[1]
    if len(list_content_contact)==1:
        content=list_content_contact[0]
        if "地址" in content:
            address=content
            phone=[]
        if "电话" in content:
            phone=content
            address=[]
    '''
    

     

     

  • 相关阅读:
    手机号码正则,座机正则,400正则
    Win10 开始运行不保存历史记录原因和解决方法
    Ubuntu 普通用户无法启动Google chrome
    在win10 64位系统安装 lxml (Python 3.5)
    SecureCRT窗口输出代码关键字高亮设置
    【转】win2008 中iis7设置404页面但返回状态200的问题解决办法
    ionic app开发遇到的问题
    Ubuntu 创建文件夹快捷方式
    Ubuntu配置PATH环境变量
    Ubuntu 升级python到3.7
  • 原文地址:https://www.cnblogs.com/webRobot/p/5505125.html
Copyright © 2020-2023  润新知