• python超链接抓取工具


    python实现自动抓取某站点内所有超链接

    (仅供学习使用)

    代码部分

    #!/usr/bin/python
    
    import requests
    import time
    import re
    import sys, getopt  #命令行选项
    from bs4 import BeautifulSoup
    # Timestamp taken once at import time, formatted as "YYYY-MM-DD HH:MM:SS" in local time.
    localtime = time.strftime("%Y-%m-%d %H:%M:%S")
    # z: raw href values collected from the page; x: cleaned-up links with clutter removed.
    z, x = [], []
    
    def _normalize_link(url, href):
        """Turn one raw ``href`` value into the string stored in the result list.

        Args:
            url: The page URL given with ``-u``; used as the base for relative links.
            href: Value of an ``<a>`` tag's ``href`` attribute, or ``None`` when the
                tag has no such attribute.

        Returns:
            The normalized link, or ``None`` when the link must be skipped
            (missing ``href``, or a link containing ``javascript``).
        """
        # Local import so the file's top-level import block does not need to change.
        from urllib.parse import urljoin

        if href is None:
            # Anchors without href would otherwise end up as the bogus link url + "None".
            return None
        link = str(href)
        if link.startswith(('//www', 'www')):
            # Protocol-relative "//www..." is stored as a bare "www..." host.
            return link.replace('//www', 'www', 1)
        if link.startswith('http'):
            return link
        if link.startswith('//'):
            # Any other protocol-relative link: drop the two leading slashes.
            return link[2:]
        if link.startswith('/'):
            # Root-relative path: resolve against the page URL instead of raw
            # concatenation, which breaks when url has a path or no trailing slash.
            return urljoin(url, link)
        if 'javascript' in link:
            return None
        return urljoin(url, link)

    def main(argv):
        """Collect every hyperlink of a page and optionally save the list to a file.

        Options:
            -h                  print the usage line and exit
            -u, --url <url>     page to fetch; the collected links are printed
            -f, --file <file>   append the collected links to this file, one per line

        Raw ``href`` values are appended to the module-level list ``z`` and the
        normalized links to the module-level list ``x``.

        Args:
            argv: Command-line arguments without the program name.

        Raises:
            SystemExit: after ``-h`` (no status) or on an invalid option (status 2).
        """
        usage = 'allsite.py -u <url> -f <file>'
        file_path = ''  # output path, empty when -f was not given
        try:
            opts, _ = getopt.getopt(argv, "hu:f:", ["url=", "file="])
        except getopt.GetoptError:
            print(usage)
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                print(usage)
                sys.exit()
            elif opt in ("-u", "--url"):
                # A timeout keeps the tool from hanging forever on a dead server.
                response = requests.get(arg, timeout=30)
                response.encoding = "utf-8"
                soup = BeautifulSoup(response.text, 'html.parser')
                hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
                z.extend(hrefs)
                for href in hrefs:
                    link = _normalize_link(arg, href)
                    if link is not None:
                        x.append(link)
                print(localtime + "  总共:" + str(len(x)) + "个网址")
                for link in x:
                    print(link)
            elif opt in ("-f", "--file"):
                file_path = arg
        # Save only after every option has been handled, so "-f out -u url" works
        # just like "-u url -f out". The file is opened once, not once per link.
        if file_path and x:
            with open(file_path, 'a', encoding='utf-8') as file_object:
                file_object.writelines(link + '\n' for link in x)
    
    # Script entry point: pass the command-line arguments, minus the program name, to main().
    if __name__ == "__main__":
       cli_args = sys.argv[1:]
       main(cli_args)
  • 相关阅读:
    使用MySQL Workbench建立数据库,建立新的表,向表中添加数据
    IntelliJ IDEA15开发时设置中java complier 的问题
    IntelliJ 15 unmapped spring configuration files found
    Redis 的性能
    SSH框架
    jquery插件模版
    cygwin,在win中开发linux程序
    MinGw与CyGwin
    升级到tomcat8时Artifact SpringMvcDemo:war exploded: Server is not connected. Deploy is not
    Socket连接超时(转)
  • 原文地址:https://www.cnblogs.com/WTa0/p/11810668.html
Copyright © 2020-2023  润新知