python3 获取博彩网站页面下所有域名（批量）

已有的域名信息

详细实现过程如下

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup as Bs4
from urllib.parse import urlparse

# Request headers sent with every page fetch; the User-Agent is a fixed
# desktop Chrome 72 (Windows 10, x64) browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.121 Safari/537.36"
    ),
}

#打开域名文件1.txt
def new_url():
    """Read the hosts to crawl from ``1.txt`` in the current directory.

    Returns:
        list[str]: One entry per non-blank line, in file order, with
        surrounding whitespace (including the line terminator) removed.

    Raises:
        OSError: If ``1.txt`` cannot be opened.
    """
    # The context manager guarantees the handle is closed even if reading
    # fails part-way through.
    with open("1.txt", "r") as source:
        stripped = (line.strip() for line in source)
        # Blank lines are dropped: an empty host would otherwise turn into a
        # request to the bare "http://" further down the pipeline.
        return [host for host in stripped if host]


#数据处理
def _extract_domains(hrefs):
    """Collect the distinct host names (netloc) of the http(s) links in *hrefs*.

    Args:
        hrefs: Iterable of ``href`` attribute values; entries may be ``None``
            for anchors that carry no ``href`` attribute.

    Returns:
        set[str]: Non-empty ``netloc`` parts of every link containing "http".
    """
    domains = set()
    for href in hrefs:
        if not href:
            # <a> without an href: nothing to parse.
            continue
        link = href.replace("\n", "")
        if "http" not in link:
            continue
        try:
            domain = urlparse(link).netloc
        except ValueError:
            # A single malformed link must not discard the whole page.
            continue
        if domain:
            # Relative links that merely contain "http" (e.g. in a query
            # string) have an empty netloc and are not domains.
            domains.add(domain)
    return domains


def get_url():
    """Crawl every host from ``new_url()`` and record the domains it links to.

    For each host the page ``http://<host>`` is fetched, the ``href`` of every
    ``<a>`` tag is reduced to its domain, and the distinct domains are
    appended to ``url_v1.txt``.  After all hosts are processed, the distinct
    lines of ``url_v1.txt`` are printed and appended to ``url_end_V.txt``.

    Hosts that cannot be fetched or parsed are reported on stdout and
    skipped; the batch continues with the next host.

    NOTE(review): both output files are opened in append mode, so results
    accumulate across runs and ``url_end_V.txt`` can contain duplicates from
    earlier runs — confirm whether that is intended.
    """
    for host in new_url():
        print("***********************************"+ host +"***********************************")
        try:
            # The timeout keeps one unresponsive host from stalling the batch.
            response = requests.get(url="http://" + host, headers=headers, timeout=10)
            # Decoding is forced to GB2312 — presumably the target sites are
            # Chinese-language pages in that encoding; verify if reused.
            response.encoding = 'gb2312'
            soup = Bs4(response.text, "lxml")
            domains = _extract_domains(
                anchor.get("href") for anchor in soup.find_all("a")
            )
        except Exception as e:
            # Best-effort crawl: report and move on.  The error is included so
            # that parser problems are not mistaken for network failures.
            print("链接无法访问", e)
            continue
        print(domains)
        if domains:
            # Open the file once per site instead of once per domain.
            with open("url_v1.txt", "a+", encoding="utf-8") as f:
                f.writelines(domain + "\n" for domain in sorted(domains))

    # Second pass: de-duplicate across all crawled sites.
    try:
        # Read with the same encoding the file was written in.
        with open("./url_v1.txt", "r", encoding="utf-8") as result:
            collected = {line.strip() for line in result}
    except FileNotFoundError:
        # No site yielded any domain, so there is nothing to consolidate.
        return
    collected.discard("")
    if not collected:
        return
    with open("url_end_V.txt", "a+", encoding="utf-8") as f:
        # Sorted so that the output order is deterministic.
        for x in sorted(collected):
            print(x)
            f.write(x + "\n")

# Script entry point: run the batch crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__=="__main__":
    get_url()

相关阅读:
POJ
Parallel Computing–Cannon算法 (MPI 实现)
POJ
POJ 2240
IOS
iOS
js遍历map匹配数据和js遍历数组匹配map数据
vue v-on:click传递动态参数
vue 权限控制按钮3种样式、内容、以及跳转事件
vue v-show与v-for同时配合v-bind使用并在href中传递多个参数的使用方法

原文地址：https://www.cnblogs.com/dddjh/p/11806085.html