• 阳光高考的问题


    import requests
    import time
    from lxml import etree

    def get_html(url, max_retries=3, timeout=10):  # fetch a page
        """Download *url* and return its decoded text, or None on failure.

        Args:
            url: Address of the page to fetch.
            max_retries: How many extra attempts to make after a non-200
                response. The retry count is bounded so a page that keeps
                failing cannot cause unbounded recursion or requests.
            timeout: Seconds to wait for the server before giving up on an
                attempt; without it a stalled connection would block forever.

        Returns:
            The response body as text when the server answers 200, otherwise
            None (after printing the reason).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        attempts = max(1, max_retries + 1)
        last_status = None
        for attempt in range(attempts):
            try:
                res = requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                # Network-level failure: report the cause and give up.
                print("问题是", e)
                return None
            if res.status_code == 200:
                # Only run encoding detection on a response we actually keep.
                res.encoding = res.apparent_encoding
                return res.text
            last_status = res.status_code
            if attempt < attempts - 1:
                # Short pause before asking the server again.
                time.sleep(0.1)
        print("问题是", "HTTP status %s" % last_status)
        return None

    def parse(html):
        """Extract the absolute detail-page URLs from one listing page.

        Args:
            html: Markup of a listing page. May be None or empty, for
                example when the download failed.

        Returns:
            A list of absolute URLs, one for every link found in the
            ``js-yxk-yxmc`` cells of the ``yxk-table`` container. The list
            is empty when there is no markup to parse.
        """
        from urllib.parse import urljoin

        # Nothing was downloaded: report "no links" instead of crashing.
        if html is None or not html.strip():
            return []

        r = etree.HTML(html)
        if r is None:
            return []

        # Base address that the relative links are resolved against.
        base_url = 'https://gaokao.chsi.com.cn'

        # Relative hrefs of every entry on the listing page.
        list_url = r.xpath("//div[@class='yxk-table']//td[@class='js-yxk-yxmc']/a/@href")

        # urljoin also copes with hrefs that lack a leading slash or that
        # are already absolute, which plain concatenation would mangle.
        return [urljoin(base_url, url) for url in list_url]
    def url_join(pages=138, page_size=20):
        """Build the URLs of the paginated search-result listing.

        Args:
            pages: Number of listing pages to generate URLs for.
            page_size: Offset step between consecutive pages; page ``k``
                (counting from zero) gets the offset ``k * page_size``.

        Returns:
            A list of ``pages`` URLs whose ``start-`` offsets are
            0, page_size, 2 * page_size, and so on.
        """
        url_start = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-'
        url_end = '.dhtml'
        return [url_start + str(page_size * i) + url_end for i in range(pages)]
    if __name__ == '__main__':
        # Only the first listing page is fetched here. To crawl every page,
        # loop over url_join() and pass each downloaded page to parse().
        url = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-0.dhtml'
        html = get_html(url)
        # get_html returns None when the download fails; skip parsing then.
        url_list = parse(html) if html is not None else []
        print(url_list)
  • 相关阅读:
    Python核心编程——正则表达式
    Python 随笔之Redis
    我的第一个Python随笔
    python练习题-day20
    python练习题-day19
    python练习题-day18
    python练习题-day16
    python练习题-day15
    python练习题-day14
    python练习题-day13
  • 原文地址:https://www.cnblogs.com/yuanjia8888/p/11113859.html
Copyright © 2020-2023  润新知