• 爬取学习 屠戮盗版天堂


    今天学习了“屠戮盗版天堂”案例,用它来练习爬取网页信息:

    代码

    import re
    from urllib.parse import urljoin

    import requests

    # Flat script: fetch the index page, collect the links listed in the
    # "2021必看热片" section, then visit each linked page and print the movie
    # name and download link extracted from it.
    url = "https://www.dy2018.com/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'
    }
    # Seconds to wait for each HTTP request; without a timeout a stalled
    # connection would block the script indefinitely.
    timeout = 10

    # NOTE(review): verify=False disables TLS certificate validation. It is
    # kept because the original script relies on it; confirm it is still
    # required for this host before reusing the code elsewhere.
    resp = requests.get(url, verify=False, headers=headers, timeout=timeout)
    # The response body is decoded as gb2312 before any matching is done.
    resp.encoding = 'gb2312'

    # Captures the contents of the first <ul> that follows the section title.
    obj = re.compile(r"2021必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
    # Captures the target of every single-quoted <a href='...'> in that list.
    obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
    # On a detail page: captures the text after "◎片  名" up to the next
    # <br /> as `name`, and the href of the anchor in the highlighted table
    # cell as `xia`.
    obj3 = re.compile(r'◎片  名(?P<name>.*?)<br />.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<xia>.*?)">', re.S)

    result = obj.finditer(resp.text)
    child_href_list = []
    for it in result:
        ul = it.group('ul')
        result2 = obj2.finditer(ul)
        for itt in result2:
            # urljoin resolves both site-relative and absolute hrefs against
            # the index URL instead of relying on manual slash stripping.
            child_href = urljoin(url, itt.group('href'))
            child_href_list.append(child_href)

    for href in child_href_list:
        # Send the same headers as the index request so detail pages are
        # fetched with the same User-Agent.
        resp2 = requests.get(href, verify=False, headers=headers, timeout=timeout)
        resp2.encoding = 'gb2312'
        result3 = obj3.search(resp2.text)
        if result3 is None:
            # search() returns None when the page does not match the
            # expected layout; skip it instead of crashing on .group().
            print(f"skip (no match): {href}")
            continue
        print(result3.group("name"))
        print(result3.group("xia"))








  • 相关阅读:
    【程序25】
    【程序24】
    【程序23】
    【程序22】
    【程序21】
    【程序20】
    【程序19】
    【程序18】
    string用法总结
    快速排序
  • 原文地址:https://www.cnblogs.com/092e/p/14955017.html
Copyright © 2020-2023  润新知