• 学习进度条100


    import requests
    from lxml import etree
    import pymysql
    
    
    def _open_db():
        """Open a new connection to the ``cvpr1`` MySQL database.

        A fresh connection is returned on every call because ``save`` and
        ``save_keywords`` in this file close the connection they are given.
        """
        # NOTE(review): host, user and password are hard-coded in source.
        # They should come from configuration or environment variables and
        # the exposed password should be rotated.
        return pymysql.connect(host="gongyunlong.mysql.rds.aliyuncs.com", user="g2431",
                               password="Gg12512544", database="cvpr1")


    def getdata(url, year):
        """Crawl one CVPR open-access listing page and store every paper on it.

        For each paper linked from the listing, the paper page is fetched, its
        title, authors, abstract and PDF link are extracted, the title words that
        pass ``checkword`` are stored through ``save_keywords``, and the paper row
        is stored through ``save``. The collected values are printed at the end.

        Args:
            url: URL of the listing page to crawl.
            year: Conference year stored with every paper found on the page.

        Returns:
            None.
        """
        from urllib.parse import urljoin

        site_root = "https://openaccess.thecvf.com/"
        # Without a timeout a stalled connection would block the crawl forever.
        request_timeout = 30

        # Fetch the listing page.
        page_text = requests.get(url, timeout=request_timeout).text
        parser = etree.HTMLParser(encoding="utf-8")
        tree = etree.HTML(page_text, parser=parser)

        # Collect the links to the individual paper pages, e.g.
        # content_cvpr_2018/html/Yang_Learning_Face_Age_CVPR_2018_paper.html
        hrefs = tree.xpath('//dt[@class="ptitle"]/a/@href') if tree is not None else []
        print(len(hrefs))

        # Values gathered across all papers, printed once the crawl is done.
        titles = []
        pdfs = []
        abstracts = []
        authors = []
        keywords = []

        for href in hrefs:
            # urljoin handles both "content/..." and "/content/..." style links.
            href = urljoin(site_root, href)
            try:
                page_text = requests.get(href, timeout=request_timeout).text
            except requests.RequestException as exc:
                # One unreachable paper must not abort the remaining crawl.
                print("skip %s: %s" % (href, exc))
                continue

            tree_link = etree.HTML(page_text, parser=parser)
            if tree_link is None:
                print("skip %s: empty page" % href)
                continue

            title = tree_link.xpath('/html/body/div/dl/dd/div[@id="papertitle"]/text()')
            abstract = tree_link.xpath('/html/body/div/dl/dd/div[@id="abstract"]/text()')
            author = tree_link.xpath('/html/body/div/dl/dd/div/b/i/text()')
            pdf = tree_link.xpath('/html/body/div/dl/dd/a[contains(text(),"pdf")]/@href')

            # A page that lacks any stored field cannot produce a valid row.
            if not (title and abstract and author):
                print("skip %s: missing title, abstract or author" % href)
                continue

            title[0] = title[0].strip()
            titles += title

            # Keywords are the title words (colons removed) that are not stop words.
            stored_title = title[0].replace(":", "")
            kept_words = [word for word in stored_title.split() if checkword(word)]
            for word in kept_words:
                save_keywords(_open_db(), word)
            # Each kept word is followed by a single space, as stored originally.
            keyword = "".join(word + " " for word in kept_words)
            keywords.append(keyword)

            if pdf:
                pdf[0] = pdf[0].replace("../../", site_root)
                pdfs += pdf

            abstract[0] = abstract[0].strip()
            abstracts += abstract

            authors += author

            # Open the connection only now, so a parsing failure above can no
            # longer leak an unused connection.
            save(_open_db(), stored_title, author[0], abstract[0], href, keyword, year)

        print(titles)
        print(hrefs)
        print(authors)
        print(abstracts)
        print(pdfs)
    
    
    def save(db, title, author, abstract, link, keyword, year):
        """Insert one paper row into the ``papers`` table, then close ``db``.

        The statement is parameterized, so quotes or other special characters in
        the scraped text can neither break the query nor inject SQL. On failure
        the transaction is rolled back and the error is printed; the exception is
        not propagated, so a single bad row does not stop the caller.

        Args:
            db: Open DB-API connection; it is always closed before returning.
            title: Paper title.
            author: Author string.
            abstract: Abstract text.
            link: URL of the paper page.
            keyword: Space-separated keywords extracted from the title.
            year: Conference year.

        Returns:
            None.
        """
        # Placeholders are filled in by the driver, not by string formatting.
        sql = ("INSERT INTO papers(title, authors, abstract_text, original_link, keywords, year) "
               "VALUES (%s, %s, %s, %s, %s, %s)")
        try:
            cursor = db.cursor()
            cursor.execute(sql, (title, author, abstract, link, keyword, year))
            db.commit()
        except Exception as exc:
            # Best effort: undo the failed insert and report it instead of
            # hiding it silently.
            db.rollback()
            print("failed to save paper %r: %s" % (title, exc))
        finally:
            # The connection is owned by this call and must always be released.
            db.close()
    
    
    def save_keywords(db, keyword):
        """Insert one keyword into the ``keywords`` table, then close ``db``.

        The statement is parameterized, so a keyword containing quotes can
        neither break the query nor inject SQL. On failure the transaction is
        rolled back and the error is printed; the exception is not propagated.

        Args:
            db: Open DB-API connection; it is always closed before returning.
            keyword: Keyword text to store.

        Returns:
            None.
        """
        sql = "INSERT INTO keywords(keyword) VALUES (%s)"
        try:
            cursor = db.cursor()
            # Parameters must be a sequence, hence the one-element tuple.
            cursor.execute(sql, (keyword,))
            db.commit()
        except Exception as exc:
            # Best effort: undo the failed insert and report it instead of
            # hiding it silently.
            db.rollback()
            print("failed to save keyword %r: %s" % (keyword, exc))
        finally:
            # The connection is owned by this call and must always be released.
            db.close()
    
    
    # Lower-cased stop words that are never stored as keywords. Built once at
    # import time as a frozenset so each lookup is O(1).
    # The original list contained "analthough", "anyup", "untilbefore" and
    # "thenthrough"; these look like pairs of words fused by a missing comma and
    # are split here. Its mixed-case entries ("A", "With", "From", "Question")
    # could never match a lower-cased word and are covered by the lower-case forms.
    _INVALID_WORDS = frozenset({
        "the", "a", "an", "and", "by", "of", "in", "on", "is", "to", "as", "from",
        "for", "with", "that", "have", "upon", "about", "above", "across", "among",
        "ahead", "after", "although", "at", "also", "along", "around", "always",
        "away", "any", "up", "under", "until", "before", "between", "beyond",
        "behind", "because", "what", "when", "would", "could", "who", "whom",
        "whose", "which", "where", "why", "without", "whether", "down", "during",
        "despite", "over", "off", "only", "other", "out", "than", "then", "through",
        "throughout", "these", "this", "those", "there", "therefore", "some", "such",
        "since", "so", "can", "many", "much", "more", "may", "might", "must", "ever",
        "even", "every", "each", "question", "questions",
    })


    def checkword(word):
        """Return True when ``word`` may be kept as a keyword.

        The comparison is case-insensitive: the word is lower-cased and rejected
        if it appears in the stop-word set.

        Args:
            word: A single whitespace-delimited token from a paper title.

        Returns:
            False for stop words, True for everything else.
        """
        return word.lower() not in _INVALID_WORDS
    
    
    if __name__ == '__main__':
        # Listing pages to crawl, one per conference day, paired with the
        # conference year stored alongside each paper. Order matches the crawl order.
        day_pages = (
            ("https://openaccess.thecvf.com/CVPR2018?day=2018-06-19", 2018),
            ("https://openaccess.thecvf.com/CVPR2018?day=2018-06-20", 2018),
            ("https://openaccess.thecvf.com/CVPR2018?day=2018-06-21", 2018),
            ("https://openaccess.thecvf.com/CVPR2019?day=2019-06-18", 2019),
            ("https://openaccess.thecvf.com/CVPR2019?day=2019-06-19", 2019),
            ("https://openaccess.thecvf.com/CVPR2019?day=2019-06-20", 2019),
            ("https://openaccess.thecvf.com/CVPR2020?day=2020-06-16", 2020),
            ("https://openaccess.thecvf.com/CVPR2020?day=2020-06-17", 2020),
            ("https://openaccess.thecvf.com/CVPR2020?day=2020-06-18", 2020),
        )
        for page_url, page_year in day_pages:
            getdata(page_url, page_year)

    提取热词

  • 相关阅读:
    突然想到一个问题:有关 cqrs 分离
    Oracle 12C 数据库安装与配置
    Android 绑定服务的作用 (参考,不确定他说的是不是对的)
    SSM框架整合(一)
    常见数据库优化方案(九)
    大量文件存储
    MyBatis 参数传递小知识(划重点)|划掉 MyBatis 常见小debug
    MyBatis 自动代码生成器
    常见数据库优化方案(八)
    常用数据库优化方案(五)
  • 原文地址:https://www.cnblogs.com/hhw12345/p/14910802.html
Copyright © 2020-2023  润新知