#!/usr/local/bin/python3.7
"""
@File    :   xicidaili.py
@Time    :   2020/06/02
@Author  :   Mozili

Download proxy listing pages from xicidaili.com, extract the IP, port and
type columns from each table row, and append them to a text file.
"""

import urllib.request
import urllib.parse
import random
import time

# Default file that scraped proxies are appended to (relative to the CWD).
OUTPUT_PATH = 'Reptile/daili.txt'

# Base URLs of the listing categories; the page number is appended to each.
URL_LIST = [
    'https://www.xicidaili.com/nn/',
    'https://www.xicidaili.com/nt/',
    'https://www.xicidaili.com/wn/',
    'https://www.xicidaili.com/wt/',
    'https://www.xicidaili.com/qq/',
]


def handler_request(url):
    """Fetch *url* with a browser-like User-Agent and return the body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded with ``bytes.decode()``'s default (UTF-8).

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Request headers: send a desktop-browser User-Agent instead of the
    # default urllib one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(req) as res:
        return res.read().decode()


def preserve_data(ips, ports, types, path=OUTPUT_PATH):
    """Append one ``"<type> <ip>:<port> "`` record per proxy to *path*.

    The three sequences are consumed in lockstep; if their lengths differ,
    the surplus entries of the longer ones are ignored. The input sequences
    are not modified. Nothing is written (and the file is not created) when
    there are no complete records.

    Args:
        ips: Proxy IP addresses as strings.
        ports: Proxy ports as strings, parallel to *ips*.
        types: Proxy types as strings, parallel to *ips*.
        path: File to append to; its parent directory must already exist.

    Raises:
        FileNotFoundError: If the parent directory of *path* does not exist.
    """
    records = [
        f"{proxy_type} {ip}:{port} "
        for ip, port, proxy_type in zip(ips, ports, types)
    ]
    if not records:
        return
    # Open the file once per batch rather than once per record.
    with open(path, 'a', encoding='utf-8') as fp:
        fp.writelines(records)


def download_content(tree):
    """Extract proxies from a parsed listing page and save them.

    Args:
        tree: Parsed document exposing an ``xpath(expression)`` method that
            returns a list of strings for ``text()`` queries.
    """
    # Only rows with class "odd" are selected; columns 2, 3 and 6 hold the
    # IP, the port and the type respectively.
    ips = tree.xpath("//tr[@class='odd']/td[2]/text()")
    ports = tree.xpath("//tr[@class='odd']/td[3]/text()")
    types = tree.xpath("//tr[@class='odd']/td[6]/text()")
    # Save the extracted data to the text file.
    preserve_data(ips, ports, types)


def main():
    """Prompt for a page range and scrape every category in ``URL_LIST``."""
    # Imported here rather than at module level so the helper functions above
    # stay importable (e.g. for testing) on machines without lxml installed.
    from lxml import etree

    # Ask for the inclusive page range.
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))

    for url in URL_LIST:
        for page in range(start_page, end_page + 1):
            new_url = url + str(page)
            content = handler_request(new_url)
            # Pause between requests to avoid hammering the server.
            time.sleep(1)
            # Parse the HTML text into an element tree and harvest it.
            tree = etree.HTML(content)
            download_content(tree)


if __name__ == "__main__":
    main()