lxml_time_代理 (lxml proxy-list scraper)


import requests
from pyquery import PyQuery as pq
import json
import jsonpath
from lxml import etree
import os
import re
import time

html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''

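# A minimal sketch (commented out) of how the sample markup above could be queried
# with lxml's XPath, since the rest of the script never touches it:
# r = etree.HTML(html)
# print(r.xpath("//li/a/@href"))                              # ['link2.html', 'link3.html', 'link4.html', 'link5.html']
# print(r.xpath("//li[contains(@class,'active')]//text()"))   # ['third item', 'fourth item']
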
# html = requests.get('http://news.4399.com/gonglue/lscs/kptj/').content.decode('gbk')
num = 0
# def pq方法(url):
#     global num
#     html=requests.get(url).content.decode('gbk')
#     doc = pq(html)
#     items = doc('#dq_list > li').items()
#     # print(doc)
#     # print(type(doc))
#     for item in items:
#         url=item.find('img').attr('lz_src')
#         num+=1
#         print(str(num),url)
#         url_content=requests.get(url).content
#         name = item.find('.kp-name').text()

#         with open('e:/py3/002/'+'{:0>4}'.format(str(num))+name+'.jpg','wb') as file:
#             file.write(url_content)
#         # print(url,name)

def transformCodec(re_data):  # decode GBK/ASCII bytes to str, dropping illegal byte spans
    try:
        re_data = re_data.decode('gbk')
    except Exception as error:
        print(error)
        print('delete illegal string, try again...')
        # pull the failing byte range out of a message such as
        # "'gbk' codec can't decode bytes in position 123-124: illegal multibyte sequence"
        # (all spaces are stripped first, so the pattern can ignore them)
        pos = re.findall(r'decodebytesinposition(\d+)-(\d+):illegal', str(error).replace(' ', ''))
        if len(pos) == 1:
            # the reported range is inclusive, so resume one byte past the second position
            re_data = re_data[0:int(pos[0][0])] + re_data[int(pos[0][1]) + 1:]
            re_data = transformCodec(re_data)
            return re_data
    return re_data

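# A quick, commented-out check of the position-parsing regex above, fed a typical
# UnicodeDecodeError message with its spaces stripped (the exact message wording is
# an assumption and may differ between Python versions):
# msg = "'gbk' codec can't decode bytes in position 30664-30665: illegal multibyte sequence"
# print(re.findall(r'decodebytesinposition(\d+)-(\d+):illegal', msg.replace(' ', '')))
# # -> [('30664', '30665')]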

def lxml方法(url):
    global num
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2473.400'}
    content = requests.get(url, headers=header).content
    html = content.decode('utf-8')
    # print(content)
    # print(html)
    r = etree.HTML(html)
    # items=r.xpath("//div[@class='box10-content']//ul[@id='dq_list']/li/a/img/@lz_src")
    items = r.xpath("//div[@id='list']/table//tr")  # one <tr> per listed proxy
    # print(items)
    for item in items:
        dl_ip = item.xpath("./td[1]/text()")    # column 1: IP address
        dl_port = item.xpath("./td[2]/text()")  # column 2: port
        dl_name = item.xpath("./td[5]/text()")  # column 5: extra info (e.g. location)
        num += 1
        dl_ip = dl_ip[0] + ":" if len(dl_ip) >= 1 else ''
        dl_port = dl_port[0] + "#" if len(dl_port) >= 1 else ''
        dl_name = dl_name[0] if len(dl_name) >= 1 else ''

        # print(len(dl_ip))
        # print(dl_ip)
        # print(r'{}{}{}'.format(dl_ip,dl_port,dl_name))
        # append each proxy as one "ip:port#location" line
        with open("proxy.txt", 'a', encoding='utf-8') as file:
            file.write('{}{}{}\n'.format(dl_ip, dl_port, dl_name))
        # lzcontent=requests.get(lzsrc).content
        # with open('e:/py3/004/'+'{:0>4}'.format(str(num))+'_'+kpname+'.jpg','wb')as file:
        #     file.write(lzcontent)


if __name__ == '__main__':
    # start proxy.txt with a date header line, e.g. "2018_10_26_collected:"
    with open("proxy.txt", 'w', encoding='utf-8') as file:
        file.write(str(time.localtime()[0]) + '_' + str(time.localtime()[1]) + '_' + str(time.localtime()[2]) + '_collected:\n')
    # url='https://www.kuaidaili.com/free/inha/1/'
    for i in range(1, 11):
        print('Pass ' + str(i) + ':\n')
        url2 = r'https://www.kuaidaili.com/free/inha/' + str(i) + r'/'
        print(url2)
        lxml方法(url2)
        time.sleep(5)

    # header={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2473.400'}
    # pq方法()
    # print(str(time.localtime()[0])+'_'+str(time.localtime()[1])+'_'+str(time.localtime()[2]))
    print(str(num) + ' ok!')


    # create numbered output directories (kept for reference, not executed)
    '''
    for dirnum in range(1, 100):
        dirnum2 = '{:0>3}'.format(str(dirnum))
        mkpath = "e:\\py3\\{}\\".format(dirnum2)
        print(mkpath)
        print('already exists!') if os.path.exists(mkpath) else os.makedirs(mkpath)
    '''
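
# A minimal sketch (commented out) of putting the collected entries to work through requests'
# proxies= parameter, assuming proxy.txt was written by lxml方法 above; the test URL
# http://httpbin.org/ip and the function name 测试代理 are illustrative choices only.
# def 测试代理():
#     with open('proxy.txt', encoding='utf-8') as file:
#         lines = file.read().splitlines()[1:]      # skip the date header line
#     for line in lines:
#         addr = line.split('#')[0]                 # "ip:port" part before the location
#         proxies = {'http': 'http://' + addr, 'https': 'http://' + addr}
#         try:
#             resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
#             print(addr, resp.status_code)
#         except Exception as error:
#             print(addr, 'failed:', error)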
Original post: https://www.cnblogs.com/pscc/p/9866225.html