• python论文爬取(四)


    app.py

    from flask import Flask, jsonify, render_template, request, json
    
    import mysqlUtil
    
    # Flask application object; the @app.route decorators below register views on it.
    app = Flask(__name__)
    
    
    @app.route("/c1")
    def hellv():
        """Render the ``view.html`` template for the ``/c1`` route."""
        page = render_template('view.html')
        return page
    
    @app.route("/text")
    def hellt():
        """Render the ``text.html`` template for the ``/text`` route."""
        page = render_template('text.html')
        return page
    
    @app.route("/findlunwen")
    def hello():
        """Placeholder view for ``/findlunwen``.

        The original body was a bare ``return``, i.e. it returned ``None``.
        Flask does not accept ``None`` as a view result and raises a
        ``TypeError`` (surfacing as HTTP 500), so an explicit empty
        "204 No Content" response is returned instead.
        """
        # NOTE(review): the intended content of this endpoint is not visible
        # here; replace the empty response once it is known.
        return '', 204
    
    
    @app.route("/")
    def hellp():
        """Render the ``find.html`` template for the site root."""
        page = render_template('find.html')
        return page
    
    
    @app.route("/c2", methods=['POST', 'GET'])
    def wordcloud():
        """Return the stored keywords and their values as JSON.

        Rows come from ``mysqlUtil.select_key()``; element 0 of each row is
        used as the keyword and element 1 as its value.  The response has
        two parallel lists under ``"keyword"`` and ``"value"``.
        """
        rows = mysqlUtil.select_key()
        print(rows)
        # Walk the rows once, keeping only the two columns that are used.
        pairs = [(row[0], row[1]) for row in rows]
        keywords = [keyword for keyword, _ in pairs]
        values = [value for _, value in pairs]
        return jsonify({"keyword": keywords, "value": values})
    
    
    @app.route("/c3", methods=['POST', 'GET'])
    def select_lunwen():
        """Search stored papers and return the matches as JSON.

        Query-string parameters:
            tiaojian: which field to search ('题目', '摘要', '作者' or '关键词').
            firinput: the search text.
            jingzhun: '精准' calls ``mysqlUtil.select_lunwenj``; any other
                value calls ``mysqlUtil.select_lunwenm``.

        Returns:
            JSON with four parallel lists: ``title``, ``zuozhe``, ``time``
            and ``lianjie``.  When ``tiaojian`` is missing or not one of the
            supported values the lists are empty (previously this raised
            ``UnboundLocalError`` and produced an HTTP 500).
        """
        # Search-field label sent by the page -> column name handed to mysqlUtil.
        columns = {
            '题目': 'title',
            '摘要': 'abstract',
            '作者': 'zuozhe',
            # NOTE(review): keyword search queries the abstract column, exactly
            # as the original code did; confirm whether that is intended.
            '关键词': 'abstract',
        }

        tiaojian = request.args.get("tiaojian")
        firinput = request.args.get("firinput")
        jingzhun = request.args.get("jingzhun")

        column = columns.get(tiaojian)
        if column is None:
            rows = []
        else:
            if jingzhun == '精准':
                search = mysqlUtil.select_lunwenj
            else:
                search = mysqlUtil.select_lunwenm
            # Treat a falsy result (e.g. None) as "no matches".
            rows = search(column, firinput) or []

        titles = []
        links = []
        authors = []
        times = []
        # Column positions follow the original code's inline comments:
        # 0 = title, 1 = link, 3 = zuozhe, 4 = time.
        for row in rows:
            titles.append(row[0])
            links.append(row[1])
            authors.append(row[3])
            times.append(row[4])

        return jsonify({"title": titles, "zuozhe": authors, "time": times, "lianjie": links})
    
    
    
    
    # Start Flask's development server only when this file is run directly.
    if __name__ == '__main__':
        # The port is passed as an int, matching the documented type of Flask.run().
        app.run(debug=True, host='127.0.0.1', port=5000)

    Keyword.py

    # -*- coding: utf-8 -*-
    import sys
    
    sys.path.append('../')
    
    import jieba
    import jieba.analyse
    import mysqlUtil
    from optparse import OptionParser
    
    # file_name = "test.txt"
    #
    # content = open(file_name, 'rb').read()
    # content = "Few-shot learning is an important area of research.  Conceptually, humans are readily able to understand new concepts given just a few examples, while in more pragmatic terms, limited-example training situations are common practice. Recent effective approaches to few-shot learning employ a metric-learning framework to learn a feature similarity comparison between a query (test) example, and the few support (training) examples.  However, these approaches treat each support class independently from one another, never looking at the entire task as a whole.  Because of this, they are constrained to use a single set of features for all possible test-time tasks, which hinders the ability to distinguish the most relevant dimensions for the task at hand.  In this work, we introduce a Category Traversal Module that can be inserted as a plug-and-play module into most metric-learning based few-shot learners.  This component traverses across the entire support set at once, identifying task-relevant features based on both intra-class commonality and inter-class uniqueness in the feature space.  Incorporating our module improves performance considerably (5%-10% relative) over baseline systems on both miniImageNet and tieredImageNet benchmarks, with overall performance competitive with the most recent state-of-the-art systems."
    # 10表示输出的前10个
    # tags = jieba.analyse.extract_tags(content, topK=10, withWeight=True)
    #
    # print(tags)
    # print(",".join(tags))
    
    
    def getKey(str):
        """Count word frequencies and store the most frequent ones.

        Each element of ``str`` is indexed with ``[0]`` and that value is
        tokenised with ``jieba.lcut``.  Tokens of length 1 and tokens found
        in the module-level ``nolist`` set are ignored.  Up to the 20 most
        frequent words are written with ``mysqlUtil.insert_key`` and printed.

        Args:
            str: iterable of rows whose first element is the text to analyse.
                (The name shadows the builtin but is kept so existing callers
                keep working.)

        Returns:
            A list of ``(word, count)`` tuples sorted by count, highest first.
        """
        # NOTE: ``nolist`` is assigned under this file's ``__main__`` guard, so
        # it only exists when the file is run as a script.
        counts = {}
        for row in str:
            for word in jieba.lcut(row[0]):
                if len(word) == 1 or word in nolist:
                    continue
                counts[word] = counts.get(word, 0) + 1

        # Stable sort: words with equal counts keep first-seen order.
        items = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

        # Slicing avoids the IndexError the original ``range(20)`` loop raised
        # when fewer than 20 distinct words were found.
        for word, count in items[:20]:
            mysqlUtil.insert_key(word, count)
            print('{0:<5}{1:<5}'.format(word, count))

        return items
    
    
    if __name__ == '__main__':
        # Words that getKey() skips when counting.  The original literal listed
        # 'of', 'in' and 'The' more than once; the set is the same without them.
        nolist = {
            'are', 'is', 'am', 'and', 'of', 'but', 'so', 'which', 'where',
            'when', 'how', 'what', 'that', 'who', 'whose', 'in', 'at', 'with',
            'for', 'the', 'a', 'an', 'to', 'on', 'we', 'We', 'this', 'by',
            'from', 'our', 'as', 'The', 'can', 'he', 'He', 'be', 'In',
        }
        res = mysqlUtil.select_ab()
        getKey(res)

    lunwenSpideer.py

    # -*- coding:utf-8 -*-
    import requests
    import re
    import json
    import Mysql
    
    # HTTP request headers carrying a desktop-browser User-Agent string.
    headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
    }
    
    
    def getData():
        """Crawl workshop papers from openaccess.thecvf.com and store each one.

        Steps:
            1. Fetch the site menu and extract every conference entry that has
               both a "Main Conference" and a "Workshops" link.
            2. For each conference, fetch its workshop menu and extract the
               individual workshop pages.
            3. For each workshop page, follow every paper link and extract the
               PDF URL, abstract and BibTeX fields (author, title, booktitle,
               month, year).
            4. Pass ``[[author, title, booktitle, "month,year", abstract,
               pdf_url]]`` to ``Mysql.insert_item``.

        Fixes relative to the published listing:
            * the ``'\\n'`` literals and the regex escapes (``\\[``, ``\\w``,
              ``\\s``) had been stripped, leaving the code unparsable or
              matching the wrong text;
            * the module-level ``headers`` is now actually sent, and every
              request has a timeout;
            * pages on which a pattern does not match are skipped instead of
              raising ``IndexError``.
        """
        base_url = 'https://openaccess.thecvf.com'
        timeout = 30  # seconds; prevents a stalled connection from hanging the crawl

        # Compile each pattern once instead of once per loop iteration.
        getA = re.compile(
            r'<dd>(.*?) \[<a href="(.*?)">Main Conference</a>\]'
            r'  \[<a href="(.*?)/menu.*?">Workshops</a>\]</dd>'
        )
        getZ = re.compile(r'<dl>(.*?)</dl>', re.DOTALL)
        getB = re.compile(r'<a href="/?(?:\w+/)?(\w+)">(.*?)</a><br><br>.*?</dd>', re.DOTALL)
        getX = re.compile(r'<dt class="ptitle"><br><a href="(.*?)">')
        getC = re.compile(
            r'<meta name="citation_pdf_url" content="(.*?)">.*?<div id="abstract">(.*?)</div>'
            r'.*?author\s+=\s+\{(.*?)\}.*?title\s+=\s+\{(.*?)\}.*?booktitle\s+=\s+\{(.*?)\}'
            r'.*?month\s+=\s+\{(.*?)\}.*?year\s+=\s+\{(.*?)\}',
            re.DOTALL,
        )

        url = base_url + "/menu"
        res = requests.get(url, headers=headers, timeout=timeout).text
        res = res.replace('\n', '').replace('<br>', '')
        keyA = getA.findall(res)
        print("会议有"+str(len(keyA)))
        print(keyA)

        # One [conference title, main-conference URL, workshops URL] entry each.
        httpList = [[title, base_url + main, base_url + workshops]
                    for title, main, workshops in keyA]

        for conference in httpList:
            workshops_url = conference[2]
            url2 = workshops_url + '/menu'
            res2 = requests.get(url2, headers=headers, timeout=timeout).text.replace('.py', '')
            print(url2)
            menus = getZ.findall(res2)
            if not menus:
                # No <dl> listing on this page: nothing to crawl here.
                continue
            keyB = getB.findall(menus[0])
            print(keyB)

            for workshop in keyB:
                h1 = workshops_url + '/' + workshop[0]
                print(h1)
                res4 = requests.get(h1, headers=headers, timeout=timeout).text

                for paper_path in getX.findall(res4):
                    act1 = base_url + paper_path  # paper page URL
                    print(act1)
                    page = requests.get(act1, headers=headers, timeout=timeout).text
                    page = page.replace('\n', '')
                    keyC = getC.findall(page)
                    print(keyC)
                    if not keyC:
                        # Page layout did not match; skip this paper.
                        continue

                    pdf_url, abstract, author, title, booktitle, month, year = keyC[0]
                    date = month + ',' + year
                    Mysql.insert_item([[author, title, booktitle, date, abstract, pdf_url]])
    
    
    # Run the crawl only when this file is executed as a script.
    if __name__ == '__main__':
        getData()
  • 相关阅读:
    查看Eclipse版本号的方法
    设置Eclipse的字体风格方式
    又遇两个小异常
    我所推崇的三种心态
    关于javax.servlet.jsp.JspTagException: Don't know how to iterate over supplied "items" in &lt;forEach&gt;
    Http请求中Content-Type讲解
    ftp实现文件上传(下载)
    解析html文档的java库及范例
    xslt循环转换子元素
    XPath学习:轴(1)——child
  • 原文地址:https://www.cnblogs.com/wangdayang/p/14914404.html
Copyright © 2020-2023  润新知