• 爬虫练习一


    # -*-coding:utf-8-*-
    import requests
    import re
    from bs4 import BeautifulSoup


    def get_encoding(response):
        """Pick the codec name used to decode a fetched page.

        The starting point is ``response.apparent_encoding``.  When that value
        is "ISO-8859-5" or "ptcp154" (the two guesses this script does not
        trust), the ``charset=...`` declaration found in ``response.text`` is
        used instead, if there is one.  Finally GB2312 / GBK are widened to
        GB18030, which is a superset of both.

        Args:
            response: object exposing ``apparent_encoding`` and ``text``
                (a ``requests.Response`` in this script).

        Returns:
            The chosen encoding name.  ``apparent_encoding`` is returned
            unchanged (possibly ``None``) when nothing above applies.
        """
        encoding = response.apparent_encoding
        if encoding in ("ISO-8859-5", "ptcp154"):
            # Capture only the charset token.  The previous greedy pattern
            # ('charset=.*"') kept the opening quote of `charset="utf-8"` and
            # could run on to a later attribute on the same line; it also
            # crashed with AttributeError when no charset was declared.
            declared = re.search(
                r'charset\s*=\s*["\']?([\w.:-]+)', response.text, re.I
            )
            if declared:
                encoding = declared.group(1)
        # Compare case-insensitively: pages commonly declare "gb2312"/"gbk"
        # in lower case, which the old exact match silently missed.
        if encoding and encoding.upper() in ("GB2312", "GBK"):
            encoding = "GB18030"
        return encoding


    def get_data(url):
        """Download one chart page and extract its song / artist rows.

        Args:
            url: address of the chart page to fetch.

        Returns:
            A list of ``{"song_name": ..., "art_name": ...}`` dicts built from
            the ``<li>`` children of the ``ul.song-list`` element.  The first
            ``<li>`` is skipped (presumably a header row -- confirm against
            the page).  Rows lacking either ``div`` are skipped, and an empty
            list is returned when the page has no ``ul.song-list`` at all.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the server does not answer within the timeout.
        """
        # NOTE(review): the Cookie below embeds a concrete session id and user
        # code; it should be loaded from configuration rather than committed.
        # NOTE(review): "br" is advertised in Accept-Encoding -- confirm that
        # brotli decoding is available in the runtime environment.
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6",
            "Cache-Control": "max-age=0",
            "Host": "www.xmusic.io",
            "Cookie": "PHPSESSID=ln2a8hdjm5u8m94te8gpjjk1r6; __gads=ID=15a6e42d58fd3651-2261f4c943d300e5:T=1653040308:RT=1653040308:S=ALNI_MaqWHa2FiTrTRv6PsmJmYenFEvZVA; __gpi=UID=0000059a0ae692a3:T=1653040308:RT=1653040308:S=ALNI_MbgmfdzvyWFPyflOG1KicvIH5jhJw; usercode=1001180AZE212ECCZ842FDF5AFZ",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36"
        }
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=30)

        # get_encoding() can hand back None when no encoding could be guessed;
        # fall back to UTF-8 so that decode() receives a usable codec name.
        encoding = get_encoding(response) or "utf-8"
        text = response.content.decode(encoding)

        soup = BeautifulSoup(text, "lxml")
        song_list = soup.find("ul", attrs={"class": "song-list"})
        if song_list is None:
            # No chart on this page: report "no songs" instead of crashing
            # on None.find_all().
            return []

        song_names = []
        for row in song_list.find_all("li")[1:]:
            title = row.find("div", attrs={"class": "song-tit"})
            artist = row.find("div", attrs={"class": "art-name"})
            if title is None or artist is None:
                # Not a song row (e.g. a separator); nothing to record.
                continue
            song_names.append({"song_name": title.text, "art_name": artist.text})
        return song_names



    if __name__ == "__main__":
        # Chart issues to crawl, keyed by year.  2017-2021 cover issues 1..52;
        # 2022 covers 1..18 plus 20 (issue 19 is left out, as in the original
        # list).
        issues_by_year = {
            "2017": list(range(1, 53)),
            "2018": list(range(1, 53)),
            "2019": list(range(1, 53)),
            "2020": list(range(1, 53)),
            "2021": list(range(1, 53)),
            "2022": list(range(1, 19)) + [20],
        }

        # `with` closes the file even if a request fails part-way through.
        # An explicit UTF-8 encoding keeps song titles writable regardless of
        # the platform's default locale encoding.
        with open("排行榜.txt", "w", encoding="utf-8") as f:
            for year, issues in issues_by_year.items():
                for issue in issues:
                    # "xxxx" is the site prefix as it appears in the source;
                    # replace it with the real scheme and host before running.
                    url = "xxxx/charts/lists/10/%s/%s" % (year, issue)
                    print("当前解析url:%s" % url)
                    for item in get_data(url):
                        # Columns are tab-separated; the previous "/t" was a
                        # typo that wrote a literal slash and 't' instead.
                        line = "\t".join([
                            "%s第%s期" % (year, issue),
                            item["song_name"],
                            item["art_name"],
                        ])
                        print("正在解析:%s第%s期%s" % (year, issue, item["song_name"]))
                        f.write(line + "\n")
  • 相关阅读:
    windows下安装elasticsearch和elasticsearch-head
    Java学习第四天之标识符与关键字
    Java学习第三天之注释
    Java学习第二天之Java程序的基本规则
    Java学习第一天之简单了解java语言及开发环境的安装
    rhel7下安装EPEL源
    Mac OS Catalina 如何删除自带的应用
    jetbrains全系列可用2018
    Windows中zabbix-agent的安装与卸载
    jumpserver修改账号密码以及jumpserver安装教程
  • 原文地址:https://www.cnblogs.com/fuchenjie/p/16300774.html
Copyright © 2020-2023  润新知