• HC360 (慧聪网) crawler
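
The script below scrapes company contact details from HC360 (b2b.hc360.com): it reads company-page URLs from uu.txt, works out which of four page templates each shop serves, extracts the company name, contact name, address, and phone number with CSS selectors and regular expressions, and writes the results to an Excel file with pandas. Requests rotate through a small User-Agent pool and a proxy pool, with a random pause between pages.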


    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import gevent
    from gevent import monkey
    monkey.patch_all()  # patch blocking I/O so gevent greenlets can run concurrently
    import time
    import re
    import random
    
    # pool of User-Agent strings to rotate through
    UA_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; GWX:MANAGED)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; GWX:MANAGED)',
    ]
    
    # proxy pool, in the {'scheme': 'proxy URL'} mapping that requests expects
    proxies_list = [{'http': 'http://10.220.70.254:808'},
                    {'http': 'http://10.221.70.254:808'},
                    {'http': 'http://10.222.70.254:808'},
                    {'http': 'http://10.223.70.254:808'}]
    
    # one User-Agent is chosen at random and reused for the whole run
    headers = {'User-Agent': random.choice(UA_list), 'Referer': 'http://b2b.hc360.com/'}
    
    def diyu(sheng, shi):
        """Walk up to 100 pages of HC360 enterprise search results for a
        province (sheng) and city (shi) and collect the company-page URLs."""
        hrefs = []
        for i in range(100):
            # w: keyword, ee: page number, z: GBK-escaped '中国:<sheng>省:<shi>' region filter
            or_url = 'http://s.hc360.com/?w={}&mc=enterprise&ee={}&z=%D6%D0%B9%FA%3A{}%CA%A1%3A{}'.format(sheng, i + 1, sheng, shi)
            res = requests.get(or_url, headers=headers)
            soup = BeautifulSoup(res.text, 'lxml')
            for url in soup.select('dd.til > h3 > a'):
                hrefs.append(url.get('href'))
        return hrefs
    
    def url_parser(urld):
        """Fetch a company page and dispatch to the parser matching the
        template that HC360 actually serves for this shop."""
        res = requests.get(urld, headers=headers, proxies=random.choice(proxies_list), timeout=60)
        if res.status_code != 404:  # status_code is an int
            soup = BeautifulSoup(res.text, 'lxml')
            if re.findall(r'公司黄页', str(soup)):  # "company yellow pages" template
                return url_HYparer(soup)
            else:
                or_url = urld + 'shop/company.html'  # assumes urld ends with '/'
                res = requests.get(or_url, headers=headers, proxies=random.choice(proxies_list), timeout=60)
                soup1 = BeautifulSoup(res.text, 'lxml')
                flag1 = re.findall(r'手机极速版', str(soup1))  # "mobile speed edition" template
                flag2 = re.findall(r'未认证 ', str(soup1))    # unverified shops
                if len(flag1) > 0:
                    return url_SJJSparer(soup1)
                elif len(flag2) > 0:
                    return url_uncertifie(soup1)
                else:
                    return url_NSJJSparer(soup1)
    
    def url_NSJJSparer(soup):
        """Parse the default certified-shop template."""
        phone = re.search(r'\d{11}|\d{4}-\d{8}', str(soup))  # mobile number or area-code landline
        data = {
            'company_name': soup.select('td.contitlebg > span')[0].text.strip(),
            'name': soup.select('span.bluezzbold.font14')[0].text.strip(),
            'address': soup.select('td.conbg.conbg2 > ul:nth-of-type(1) > li:nth-of-type(2)')[0].get('title'),
            'phone': phone.group() if phone else None}
        return data
    
    def url_HYparer(soup):
        """Parse the "company yellow pages" (公司黄页) template."""
        data = {
            'company_name': soup.select('div.sub-info > h1')[0].text,
            'name': soup.select('samp')[0].text,
            'address': soup.select('div.tableCon > div:nth-of-type(2) > ul > li:nth-of-type(3) > span.conRight')[0].text,
            'phone': soup.select('div.tableCon > div:nth-of-type(2) > ul > li:nth-of-type(2) > span.conRight')[0].text
        }
        return data
    
    def url_SJJSparer(soup):
        """Parse the "mobile speed edition" (手机极速版) template."""
        phone = re.search(r'\d{11}|\d{4}-\d{8}', str(soup))
        data = {
            'company_name': soup.select('div.ContacCon1 > h3')[0].text.strip(),
            'name': soup.select('div.ContactsName > span > a')[0].text.strip(),
            'address': soup.select('div.ContacCon3 > ul > li:nth-of-type(1) > div.con3Rig')[0].text.strip(),
            'phone': phone.group() if phone else None}
        return data
    
    def url_uncertifie(soup):
        """Parse the unverified-shop (未认证) template."""
        phone = re.search(r'\d{11}|\d{4}-\d{8}', str(soup))
        data = {
            'company_name': soup.select('td.contitlebg_1 > span')[0].text.strip(),
            'name': soup.select('span.bluezzbold.font14')[0].text.strip(),
            'address': soup.select('td.conbg.conbg2 > ul:nth-of-type(1) > li:nth-of-type(2)')[0].text.strip(),
            'phone': phone.group() if phone else None}
        return data
    
    if __name__ == '__main__':
        with open('uu.txt', 'r') as f:  # one company URL per line
            info_total = []
            for i in f:
                try:
                    info_ary = url_parser(i.strip())
                    time.sleep(random.randint(1, 5))  # random delay between requests
                    if info_ary:  # url_parser returns None on a 404
                        info_total.append(info_ary)
                    print(len(info_total))  # progress counter
                except Exception as e:
                    print(e, i.strip())
            df = pd.DataFrame(info_total)
            df.to_excel('huicong_beijing.xlsx')  # needs the openpyxl engine installed
            print('Done')
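
The script imports gevent and monkey-patches the standard library but never spawns a greenlet, so the loop above still fetches one URL at a time. A minimal sketch of a concurrent main loop, assuming the same url_parser and uu.txt layout (the safe_parse wrapper and the batch size of 10 are illustrative choices, not part of the original):

    # Hypothetical gevent-based replacement for the sequential __main__ block.
    # Batching bounds the number of simultaneous connections to the site.
    def safe_parse(url):
        # shield the batch: one failing page should not kill its siblings
        try:
            return url_parser(url)
        except Exception as e:
            print(e, url)
            return None

    if __name__ == '__main__':
        with open('uu.txt', 'r') as f:
            urls = [line.strip() for line in f if line.strip()]
        info_total = []
        batch = 10  # illustrative concurrency limit
        for start in range(0, len(urls), batch):
            jobs = [gevent.spawn(safe_parse, u) for u in urls[start:start + batch]]
            gevent.joinall(jobs, timeout=120)
            info_total.extend(job.value for job in jobs if job.value)
            time.sleep(random.randint(1, 5))  # stay polite between batches
        pd.DataFrame(info_total).to_excel('huicong_beijing.xlsx')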
• Original article: https://www.cnblogs.com/Erick-L/p/6945009.html