• bokeyuan_python文章爬取入mongodb读取--LOWBIPROGRAMMER


    # -*- coding: utf-8 -*-
    import requests,os
    from lxml import etree
    from pymongo import *

    class Boke(object):
        """Crawl the cnblogs Python category page and store the posts in MongoDB.

        For every post listed on the category page, the post page is fetched,
        its title and body text are inserted into the ``boke.info`` collection,
        and the post URL is inserted into the ``boke.zhu`` collection.
        """

        def __init__(self):
            self.url = "https://www.cnblogs.com/cate/python/"
            self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331'}
            # (zhu, info) collection pair, created on first use by dbs().
            self._collections = None

        @staticmethod
        def _first(nodes, default=None):
            """Return the first XPath match in *nodes*, or *default* if empty."""
            return nodes[0] if nodes else default

        def get_data(self, url, timeout=10):
            """Fetch *url* with the crawler's headers and return the raw body.

            Args:
                url: Address to download.
                timeout: Seconds to wait for the server before ``requests``
                    raises; prevents one stalled host from hanging the crawl.

            Returns:
                The response body as bytes (``response.content``).
            """
            response = requests.get(url, headers=self.headers, timeout=timeout)
            return response.content

        def xml_data(self, data):
            """Parse the category page and process every listed post.

            Args:
                data: HTML of the category page, as returned by get_data().
            """
            html = etree.HTML(data)
            for post in html.xpath("//div[@class='post_item']"):
                info_url = self._first(
                    post.xpath("./div[@class='post_item_body']/h3/a/@href"))
                if info_url is None:
                    # No link in this entry: nothing to crawl or record.
                    continue
                self.info_data(info_url)
                self.write_dbs({'url': info_url})

        def info_data(self, data):
            """Download one post page and store its title and body text.

            Args:
                data: URL of the post page (name kept for compatibility).
            """
            # NOTE(review): nothing in this class writes into this directory;
            # the creation is kept only to preserve the original side effect.
            path = "f:/woc/"
            if not os.path.exists(path):
                os.makedirs(path)
            html = etree.HTML(self.get_data(data))
            posts = html.xpath("//div[@id='topics']/div[@class='post']")
            for post in posts:
                record = {
                    # Title is stored as None when the page has no title link.
                    'title': self._first(
                        post.xpath("./h1[@class='postTitle']/a/text()")),
                    'info': post.xpath("./div[@class='postBody']//text()"),
                }
                self.write1_dbs(record)

        def dbs(self):
            """Return the ``(zhu, info)`` collections of the ``boke`` database.

            The MongoClient is created once and reused on later calls instead
            of opening a new connection pool for every inserted document.
            """
            if self._collections is None:
                connect = MongoClient('127.0.0.1', 27017)
                database = connect['boke']
                self._collections = (database['zhu'], database['info'])
            return self._collections

        @staticmethod
        def _insert_and_dump(collection, data):
            """Insert *data* into *collection*, then print every stored document."""
            collection.insert_one(data)
            for document in collection.find():
                print(document)

        def write_dbs(self, data):
            """Insert *data* into the ``zhu`` collection and print its contents."""
            zhu, _ = self.dbs()
            self._insert_and_dump(zhu, data)

        def write1_dbs(self, data):
            """Insert *data* into the ``info`` collection and print its contents."""
            _, info = self.dbs()
            self._insert_and_dump(info, data)

        def run(self):
            """Fetch the category page and crawl every post it lists."""
            self.xml_data(self.get_data(self.url))
    # Script entry point: build the crawler and run a single crawl pass.
    if __name__ == '__main__':
        Boke().run()
  • 相关阅读:
    jstl 部分标签
    Maven pom.xml 元素配置说明(一)
    spring 参数绑定
    mysql 索引
    ArrayList和HashSet的Contains()方法(转)
    每日记载内容总结44
    剑指offer42:不用加减乘除做加法
    动态规划常见题型
    华为机试-统计每个月兔子的总数
    华为机试-字符串合并处理
  • 原文地址:https://www.cnblogs.com/xcsg/p/10138727.html
Copyright © 2020-2023  润新知