• Python 网页爬虫(带登录信息)


    注意点:
    1. 用 Fiddler 抓取登录后的 headers 和 cookies;
    2. 每抓取一次网页后暂停一段随机时间,以免触发反爬虫机制;
    3. 抓取前需要先关闭 Fiddler,以防止端口被占用。

    还需解决的问题:

    爬取记录较多时,会触发反爬虫机制。

    用 Fiddler 抓取登录后的 headers 和 cookies


    也可以在火狐浏览器中按 F12 打开开发者工具查看



    #-*- coding: utf-8 -*-  
    import sys  
    import time  
    import urllib  
    import bs4  
    import re  
    import random  
    
    import requests  
    
    
    def _safe_string(tag, default=''):
        """Return ``tag.string``, or *default* when the tag or its string is missing."""
        text = getattr(tag, 'string', None)
        return default if text is None else text


    def main(startUrl):
        """Scrape one search-result page and append its rows to ``csvContent``.

        The page at ``startUrl`` is fetched with the hard-coded headers and
        cookies below. For every company entry found on it, the summary
        fields are read from the result list, the company's detail page is
        fetched, and one comma-separated line is appended to the module-level
        ``csvContent`` string. A company whose detail page cannot be fetched
        or parsed is skipped. The function sleeps a random 5-15 seconds after
        every HTTP request.

        Args:
            startUrl: URL of the search-result page to scrape.

        Returns:
            None. Results are accumulated in the global ``csvContent``.
        """
        print(startUrl)

        global csvContent

        headers = {
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        }

        # Cookies captured from a logged-in browser session. They are
        # account-specific; replace them with freshly captured values.
        cookies = {
            '_csrf': 'iN90P1mtdXxv/ZWpt8W8kg==',
            '_csrf_bk': 'b095b5ac898229ebf3adc8f0e901523a',
            'aliyungf_tc': 'AQAAAAoHdhpO9Q4AHJUE2sFxGtgWCuH9',
            'auth_token': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODU1MDEzNTUyMSIsImlhdCI6MTUxNzE5MTI3OSwiZXhwIjoxNTMyNzQzMjc5fQ.z9l-sSAyPlLFsD97Yrs7khD1dRBCyyByb-sijUgorQzgR5HdVykD1_W_gn8R2aZSUSRhR_Dq0jPNEYPJlI22ew',
            'bannerFlag': 'true',
            'csrfToken': '9_lfoqS9eAThxvDa8XjDHA6B',
            'Hm_lpvt_e92c8d65d92d534b0fc290df538b4758': '1517191269',
            'Hm_lvt_e92c8d65d92d534b0fc290df538b4758': '1516864063',
            'OA': 'TkU7nzii8Vwbw4JYrV6kjTg0WS645VnS6CIervVVizo=',
            'ssuid': '360989088',
            'TYCID': '709b5a10019e11e89c185fb756815612',
            'tyc-user-info': '%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODU1MDEzNTUyMSIsImlhdCI6MTUxNzE5MTI3OSwiZXhwIjoxNTMyNzQzMjc5fQ.z9l-sSAyPlLFsD97Yrs7khD1dRBCyyByb-sijUgorQzgR5HdVykD1_W_gn8R2aZSUSRhR_Dq0jPNEYPJlI22ew%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252218550135521%2522%257D',
            'undefined': '709b5a10019e11e89c185fb756815612',
        }

        resultPage = requests.get(startUrl, headers=headers, cookies=cookies)

        # Pause 5-15 seconds after each request to stay under the site's
        # anti-crawler threshold.
        randomTime = random.random() * 10 + 5
        print('randomTime    ' + str(randomTime))
        time.sleep(randomTime)

        soup = bs4.BeautifulSoup(resultPage.text, 'html.parser')

        industry_tags = soup.find_all(
            attrs={'class': 'in-block overflow-width vertival-middle sec-c2'})
        if not industry_tags:
            # Without the industry label the page is not a normal result
            # list; skip it instead of crashing the whole run.
            print('industry label not found, skipping ' + startUrl)
            return
        industry = _safe_string(industry_tags[0])

        companys = soup.find_all(attrs={'class': 'search_right_item ml10'})

        for company in companys:
            title_link = company.contents[0].a
            fields = [industry, _safe_string(title_link)]

            # Summary fields shown in the result list. Each recognised child
            # contributes exactly one column; a blank is written when the
            # expected inner tag is missing.
            for child in company.contents[1].div.children:
                content = str(child)

                if '法定代表人' in content:
                    fields.append(_safe_string(getattr(child, 'a', None)))
                elif '注册资本' in content or '注册时间' in content:
                    fields.append(_safe_string(getattr(child, 'span', None)))
                elif '江苏' in content:
                    # The score is the number that follows the 'f20' class
                    # attribute in the child's markup.
                    score = re.search(r'f20">(\d+)', content)
                    fields.append(score.group(1) if score else '')

            try:
                link = title_link.attrs['href']
                fields.append(link)

                linkResult = requests.get(link, headers=headers, cookies=cookies)

                randomTime2 = random.random() * 10 + 5
                print('randomTime 2    ' + str(randomTime2) + '        ' + link)
                time.sleep(randomTime2)

                linkSoup = bs4.BeautifulSoup(linkResult.text, 'html.parser')

                location = linkSoup.find_all(attrs={'colspan': '4'})[0].text
                fields.append(location.replace('附近公司', ''))

                fields.append(
                    linkSoup.find(attrs={'class': 'new-err selfRisk pl5 pr5'}).string)
                fields.append(
                    linkSoup.find(attrs={'class': 'new-err roundRisk pl5 pr5'}).string)

                navigation = linkSoup.find(attrs={
                    'class': 'navigation new-border-top new-border-right new-c3 js-company-navigation'})
                riskItems = navigation.find(attrs={'class': 'over-hide'}).find_all(
                    attrs={'class': 'float-left f14 text-center nav_item_Box'})

                # Navigation boxes 2..5 hold the per-category counters; an
                # entry without a count badge is recorded as '0'.
                for index in range(2, 6):
                    for content in riskItems[index].contents[1]:
                        if '<span class="c9">' in str(content):
                            fields.append(
                                _safe_string(getattr(content, 'span', None), '0'))
                        else:
                            fields.append('0')

                # join() raises TypeError if a mandatory field above came
                # back as None, which drops the row like any other failure.
                csvContent += ','.join(fields).rstrip(',') + '\n'
            except Exception as exc:
                # Best effort: skip this company but report why. Catching
                # Exception (not a bare except) keeps Ctrl-C working.
                print('exception', repr(exc))

            # Progress dump followed by blank separator lines.
            print(csvContent)
            print('\n' * 4)
        
          
          
    if __name__ == '__main__':
        # Header line written at the top of every result file.
        csvHeader = '行业分类,企业描述,法定代表人,注册资本,注册时间,分数, 细节, 注册地址, 天眼风险-自身风险, 天眼风险-周边风险, 法律诉讼, 法院公告, 失信人, 被执行人, 开庭公告, 经营异常, 行政处罚, 严重违法,股权出质,动产抵押,欠税公告,司法拍卖, 招投标,债券信息,购地信息,招聘,税务评级,抽查检查,产品信息,进出口信用,资质证书,微信公众号,商标信息,专利,软件著作权,作品著作权,网站备案\n'

        # One output file per category number; each category is crawled
        # across result pages 1..5.
        for i in range(3, 4):
            name = str(i).zfill(2)

            # main() appends its rows to this module-level string.
            csvContent = csvHeader

            for j in range(1, 6):
                main('https://szh.tianyancha.com/search/oc' + name + '/p' + str(j))

            # Raw string: in a normal literal the '\r' of 'D:\result' is a
            # carriage return. The with-block guarantees the file is closed.
            with open(r'D:\result-' + name + '.csv', 'w') as outFile:
                outFile.write(csvContent)

            csvContent = ''

        print(csvContent)
        
        
        
    

    运行结果示例



    代码链接

    博客地址:https://blog.csdn.net/xiang__liu,https://www.cnblogs.com/xiang--liu/
  • 相关阅读:
    taobao 爬虫基本思路分享
    浅谈python中字典append 到list 后值的改变问题
    滑动验证码验证
    selenium:css_selector定位详解
    01分数规划
    可持久化并查集(草稿)
    后缀自动机求endpos集大小
    伯努利数公式
    HDU 6619 Horse 斜率优化dp
    别人的回文自动机
  • 原文地址:https://www.cnblogs.com/xiang--liu/p/9710397.html
Copyright © 2020-2023  润新知