• 红楼梦 (Dream of the Red Chamber): scraping the chapter index and writing it to MySQL + MongoDB


    MySQL

    import requests
    import pymysql
    from bs4 import BeautifulSoup
    
    conn = pymysql.Connect(host='127.0.0.1', user='root', password='123123', database='hlm')
    cursor = conn.cursor()
    
    url = 'http://www.purepen.com/hlm/'
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    res.encoding = 'gbk'  # the site serves GBK-encoded pages; set this before reading res.text
    soup = BeautifulSoup(res.text, 'lxml')
    
    for tr_list in soup.find_all(name='tr'):
        td_data = list(tr_list.find_all(name='td'))
        # Each index row lists two chapters: (section, title/link) twice
        if len(td_data) == 4:
            section1 = td_data[0].text
            title1 = td_data[1].text
            # Read the link straight off the <a> tag instead of regexing raw HTML
            url1 = 'http://www.purepen.com/hlm/' + td_data[1].a['href']

            section2 = td_data[2].text
            title2 = td_data[3].text
            url2 = 'http://www.purepen.com/hlm/' + td_data[3].a['href']
            print(section1, title1, url1, '\n',
                  section2, title2, url2)

            # content is the table name; pass values as parameters so titles
            # containing quotes cannot break the SQL
            sql = "insert into content (section, title, url) values (%s, %s, %s)"
            cursor.execute(sql, (section1, title1, url1))
            cursor.execute(sql, (section2, title2, url2))

            # remember to commit
            conn.commit()
    
    cursor.close()
    conn.close()
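
    The inserts assume the hlm.content table already exists. Below is a minimal sketch of creating it with pymysql; the column names come from the insert statement above, but the types, lengths, and the id key are assumptions:

    import pymysql

    conn = pymysql.Connect(host='127.0.0.1', user='root', password='123123', database='hlm')
    cursor = conn.cursor()
    # Assumed schema: section/title are short strings, url holds the chapter link
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS content (
            id      INT AUTO_INCREMENT PRIMARY KEY,
            section VARCHAR(64),
            title   VARCHAR(255),
            url     VARCHAR(255)
        ) DEFAULT CHARSET = utf8mb4
    """)
    conn.commit()
    cursor.close()
    conn.close()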
    
    

    MongoDB

    import requests
    from bs4 import BeautifulSoup
    from pymongo import MongoClient
    
    client = MongoClient('localhost', 27017)
    db = client['ljw']   # database
    collection = db.lj   # collection
    
    url = 'http://www.purepen.com/hlm/'
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    res.encoding = 'gbk'  # the site serves GBK-encoded pages; set this before reading res.text
    soup = BeautifulSoup(res.text, 'lxml')
    
    for tr_list in soup.find_all(name='tr'):
        td_data = list(tr_list.find_all(name='td'))
        # Each index row lists two chapters: (section, title/link) twice
        if len(td_data) == 4:
            section1 = td_data[0].text
            title1 = td_data[1].text
            url1 = 'http://www.purepen.com/hlm/' + td_data[1].a['href']

            section2 = td_data[2].text
            title2 = td_data[3].text
            url2 = 'http://www.purepen.com/hlm/' + td_data[3].a['href']
            print(section1, title1, url1, '\n',
                  section2, title2, url2)
            # One document per chapter; keys: 章节 = section, 标题 = title, 网址 = url
            result = collection.insert_one({'章节': section1, '标题': title1, '网址': url1})
            print(result.inserted_id)
            result = collection.insert_one({'章节': section2, '标题': title2, '网址': url2})
            print(result.inserted_id)
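
    To verify the writes, a quick read-back sketch, using the same ljw database and lj collection as above; the 章节/标题/网址 field names match the documents inserted by the loop:

    # Show the first few stored chapters and the total count
    for doc in collection.find().limit(5):
        print(doc['章节'], doc['标题'], doc['网址'])
    print('total documents:', collection.count_documents({}))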
    
    
    
• Original article: https://www.cnblogs.com/kai-/p/12662845.html