• 6.4 记录


    1、个人作业2

    数据爬取阶段

    import requests
    from lxml import etree
    import pymysql


    def getdata(url):
        """Crawl one CVPR open-access listing page and store its papers in MySQL.

        Downloads the listing at *url*, follows every paper link found under
        ``<dt class="ptitle">`` and extracts the title, authors, abstract and
        PDF link from each paper page.  Every title word accepted by
        ``checkword`` is stored through ``save_keywords`` and the paper itself
        is stored through ``save``.  The collected values are printed once the
        whole listing has been processed.

        Paper pages that lack any of the expected fields are skipped with a
        message instead of aborting the crawl with an ``IndexError``.

        Args:
            url: Address of a listing page on https://openaccess.thecvf.com/.

        Raises:
            requests.RequestException: If a page cannot be downloaded
                (including when the request times out).
        """
        site_root = "https://openaccess.thecvf.com/"
        request_timeout = 30  # seconds; without it a stalled server blocks forever
        # NOTE(review): credentials are hard-coded in the source; consider
        # moving them to configuration or environment variables.
        db_settings = {
            "host": "127.0.0.1",
            "user": "root",
            "password": "lin0613",
            "database": "users",
        }

        # Request the CVPR listing page.
        page_text = requests.get(url, timeout=request_timeout).text
        parser = etree.HTMLParser(encoding="utf-8")
        tree = etree.HTML(page_text, parser=parser)

        # Collect the (relative) links to the individual paper pages.
        hrefs = tree.xpath('//dt[@class="ptitle"]/a/@href')
        print(len(hrefs))

        # Per-listing accumulators, printed at the end.
        titles = []
        pdfs = []
        abstracts = []
        authors = []
        keywords = []

        for href in hrefs:
            paper_url = site_root + href
            paper_html = requests.get(paper_url, timeout=request_timeout).text
            tree_link = etree.HTML(paper_html, parser=parser)

            title = tree_link.xpath('/html/body/div/dl/dd/div[@id="papertitle"]/text()')
            pdf = tree_link.xpath('/html/body/div/dl/dd/a[contains(text(),"pdf")]/@href')
            abstract = tree_link.xpath('/html/body/div/dl/dd/div[@id="abstract"]/text()')
            author = tree_link.xpath('/html/body/div/dl/dd/div/b/i/text()')

            # Every field is indexed with [0] below, so validate all of them
            # before touching the database.
            if not (title and pdf and abstract and author):
                print("skip paper with missing fields: " + paper_url)
                continue

            title[0] = title[0].strip()
            titles += title

            # Colons are dropped before splitting the title into words; the
            # colon-free title is also the one passed to save() below.
            title[0] = title[0].replace(":", "")
            keyword = ""
            for word in title[0].split():
                if checkword(word):
                    # save_keywords() closes the connection it receives, so
                    # each call gets a fresh one.
                    save_keywords(pymysql.connect(**db_settings), word)
                    keyword += word + " "
            keywords.append(keyword)

            pdf[0] = pdf[0].replace("../../", site_root)
            pdfs += pdf

            abstract[0] = abstract[0].strip()
            abstracts += abstract

            authors += author

            # Open the connection only now, so that nothing can fail between
            # connecting and save(), which closes the connection itself.
            save(pymysql.connect(**db_settings),
                 title[0], author[0], abstract[0], paper_url, keyword)

        print(titles)
        print(hrefs)
        print(authors)
        print(abstracts)
        print(pdfs)


    def save(db, title, author, abstract, link, keyword):
        """Insert one paper into the ``papers`` table and close the connection.

        The values are handed to the driver as query parameters instead of
        being interpolated into the SQL text, so quotes inside a title or an
        abstract can neither break the statement nor inject SQL.

        Args:
            db: An open DB-API connection (``cursor``, ``commit``, ``rollback``
                and ``close`` are used).  It is always closed before returning.
            title: Paper title.
            author: Author string.
            abstract: Abstract text.
            link: URL of the paper page.
            keyword: Space-separated keywords derived from the title.
        """
        # Placeholders are left unquoted: the driver quotes and escapes each value.
        sql = (
            "INSERT INTO papers(title, authors, abstract_text, original_link, keywords) "
            "VALUES (%s, %s, %s, %s, %s)"
        )
        try:
            cursor = db.cursor()
            # Run the insert.
            cursor.execute(sql, (title, author, abstract, link, keyword))
            print("true")
            # Make the insert permanent.
            db.commit()
        except Exception:
            print("error wenzhang")
            # Undo the partial transaction on failure.
            db.rollback()
        finally:
            # Close the connection on every path, including a failed rollback.
            db.close()


    def save_keywords(db, keyword):
        """Insert one keyword into the ``keywords`` table and close the connection.

        The keyword is passed as a query parameter rather than being formatted
        into the SQL text, so a word containing a quote cannot break the
        statement or inject SQL.

        Args:
            db: An open DB-API connection (``cursor``, ``commit``, ``rollback``
                and ``close`` are used).  It is always closed before returning.
            keyword: The word to store.
        """
        # The placeholder is left unquoted: the driver quotes and escapes the value.
        sql = "INSERT INTO keywords(keyword) VALUES (%s)"
        try:
            cursor = db.cursor()
            # Run the insert.
            cursor.execute(sql, (keyword,))
            print("true")
            # Make the insert permanent.
            db.commit()
        except Exception:
            print("error word")
            # Undo the partial transaction on failure.
            db.rollback()
        finally:
            # Close the connection on every path, including a failed rollback.
            db.close()


    # Lower-cased function words that must not be stored as keywords.  Built
    # once as a frozenset so each lookup is a constant-time membership test.
    # Entries that had been fused by missing commas ("analthough", "anyup",
    # "untilbefore", "thenthrough") are split into the words they were meant
    # to be; duplicates and capitalised variants are dropped because the
    # lookup lower-cases its input.
    _INVALID_WORDS = frozenset({
        "the", "a", "an", "and", "by", "of", "in", "on", "is", "to", "as",
        "from", "for", "with", "that", "have", "upon", "about", "above",
        "across", "among", "ahead", "after", "although", "at", "also", "along",
        "around", "always", "away", "any", "up", "under", "until", "before",
        "between", "beyond", "behind", "because", "what", "when", "would",
        "could", "who", "whom", "whose", "which", "where", "why", "without",
        "whether", "down", "during", "despite", "over", "off", "only", "other",
        "out", "than", "then", "through", "throughout", "these", "this",
        "those", "there", "therefore", "some", "such", "since", "so", "can",
        "many", "much", "more", "may", "might", "must", "ever", "even",
        "every", "each",
    })


    def checkword(word):
        """Tell whether *word* should be kept as a keyword.

        Args:
            word: A single token taken from a paper title.

        Returns:
            ``False`` when the lower-cased word is one of the excluded function
            words, ``True`` otherwise.
        """
        return word.lower() not in _INVALID_WORDS


    if __name__ == '__main__':
        # Listing pages to crawl, one per conference day.  Days that are
        # currently switched off stay in the list as comments so they can be
        # re-enabled without looking the addresses up again.
        day_urls = (
            # "https://openaccess.thecvf.com/CVPR2018?day=2018-06-20",
            "https://openaccess.thecvf.com/CVPR2018?day=2018-06-21",
            "https://openaccess.thecvf.com/CVPR2019?day=2019-06-18",
            # "https://openaccess.thecvf.com/CVPR2019?day=2019-06-19",
            # "https://openaccess.thecvf.com/CVPR2019?day=2019-06-20",
            "https://openaccess.thecvf.com/CVPR2020?day=2020-06-16",
            # "https://openaccess.thecvf.com/CVPR2020?day=2020-06-17",
            # "https://openaccess.thecvf.com/CVPR2020?day=2020-06-18",
            # "https://openaccess.thecvf.com/CVPR2018?day=2018-06-19",
        )
        for day_url in day_urls:
            getdata(day_url)



  • 相关阅读:
    ubuntu下查看环境变量
    ubuntu关闭自动更新、打开 ubuntu 的 apport 崩溃检测报告功能
    Ubuntu 配置AP总结
    ubuntu 12.04亮度无法调节和无法保存屏幕亮度解决办法(echo_brightness)
    Ubuntu 13.04 双显卡安装NVIDIA GT 630M驱动
    Linux下添加硬盘,分区,格式化详解
    Eclipse启动分析
    “蚁族” 的生活方式画像
    Ubuntu下的防火墙
    Ubuntu下的杀毒
  • 原文地址:https://www.cnblogs.com/lx06/p/14907379.html
Copyright © 2020-2023  润新知