• Multi-threaded crawler: scraping the entire Biquge free-novel site
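
The script below crawls http://www.xxbqg5200.com/ (a Biquge mirror) end to end: it collects the category links from the home page, expands each category into its paginated listing URLs, pushes those pages onto a queue, and runs ten worker threads that pull pages off the queue and store book links, chapter links, and chapter text in a local MongoDB database named biquge.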


    import threading
    import re
    import requests
    import pymongo
    from queue import Queue, Empty
    from bs4 import BeautifulSoup as BP  # the 'lxml' parser below also requires lxml to be installed

    # A single client/database is shared by every function below
    client = pymongo.MongoClient(host='localhost', port=27017)
    mg = client['biquge']

    def get_fenlei():
        """
        Crawl every book category from the site's nav bar.
        :return: list of category URLs
        """
        collection = mg['biquge_info']
        url = 'http://www.xxbqg5200.com/'
        sp = requests.get(url=url, headers=headers, cookies=cookies)
        soup = BP(sp.text, 'lxml')
        fenlei_url = soup.select('#wrapper > div.nav > ul > li > a')
        list1 = []
        for i in fenlei_url:
            href_url = i['href']
            fenlei_name = i.get_text()
            # Only category links look like /sort/...; skip the other nav entries
            if href_url.split('/')[1] != 'sort':
                continue
            fenlei_href_url = 'http://www.xxbqg5200.com' + str(href_url)
            list1.append(fenlei_href_url)
            try:
                data = {'fenlei_name': fenlei_name, 'fenlei_url': fenlei_href_url}
                collection.insert_one(data)  # insert() was removed in PyMongo 4.x
                print('{} >>> stored successfully'.format(fenlei_name))
            except Exception:
                print('{} >>> store failed'.format(fenlei_name))
        return list1

    def get_page():
        """
        Expand every category into its paginated listing URLs.
        :return: list of listing-page URLs
        """
        list1_url = get_fenlei()
        list_page = []
        a = 0
        for i in list1_url:
            a += 1
            sort_part = i.split('/')[3]  # the 'sort' path segment of the category URL
            page_href_url = 'http://www.xxbqg5200.com/' + sort_part + '/' + str(a) + '_'
            # Listing pages follow the pattern /sort/<category>_<page>/;
            # every category is assumed to have at most 189 pages
            for page in range(1, 190):
                list_page.append(page_href_url + str(page) + '/')
        return list_page
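
    # Example of the generated pattern (first category, i.e. a == 1):
    #   http://www.xxbqg5200.com/sort/1_1/
    #   http://www.xxbqg5200.com/sort/1_2/
    #   ...
    #   http://www.xxbqg5200.com/sort/1_189/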

    def get_tushu_url():
        """
        Worker thread: take listing pages off the queue, then crawl every book
        on the page, every chapter of each book, and each chapter's text.
        """
        while True:
            # queue.Queue is already thread-safe, so no extra Lock is needed;
            # get_nowait() lets the worker exit cleanly once the queue drains
            try:
                url = q.get_nowait()
            except Empty:
                break
            print(url, '###################################')
            collection = mg['biquge_info']
            list1 = []
            sp = requests.get(url=url, headers=headers, cookies=cookies)
            soup = BP(sp.text, 'lxml')
            tushu_url = soup.select('#newscontent > div.l > ul > li > span.s2 > a')
            for tushu_href_url in tushu_url:
                tushu_name_url = tushu_href_url['href']
                tushu_name = tushu_href_url.get_text()
                list1.append(tushu_name_url)
                try:
                    data = {'tushu_name': tushu_name, 'tushu_name_url': tushu_name_url}
                    collection.insert_one(data)
                    print('{} >>> stored successfully'.format(tushu_name))
                except Exception:
                    print('{} >>> store failed'.format(tushu_name))
            # Crawl every chapter link of each book
            list2 = []
            for zhang_url in list1:
                response = requests.get(zhang_url, headers=headers, cookies=cookies)
                soup_zhang = BP(response.text, 'lxml')
                zhangjie_url = soup_zhang.select('#list > dl > dd > a')
                for zhang_href in zhangjie_url:
                    zhangjie_href = zhang_href['href']
                    zhangjie_name = zhang_href.get_text()
                    content_url = 'http://www.xxbqg5200.com' + str(zhangjie_href)
                    list2.append(content_url)
                    try:
                        data_zhangjie = {'zhangjie_name': zhangjie_name, 'zhangjie_href': zhangjie_href}
                        collection.insert_one(data_zhangjie)
                        print('{} >>> stored successfully'.format(zhangjie_name))
                    except Exception:
                        print('{} >>> store failed'.format(zhangjie_name))
            # Crawl the text of every chapter
            content_sql = mg['tushu_content']
            for content_list_url in list2:
                response1 = requests.get(content_list_url, headers=headers, cookies=cookies)
                soup_content = BP(response1.text, 'lxml')
                content_nei = soup_content.select('#content')
                for text_content in content_nei:
                    # Keep only Chinese characters, letters, and digits;
                    # findall needs the tag's text, not the bs4 Tag itself
                    filter_content = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9]+', text_content.get_text())
                    filter_text_content = ''.join(filter_content)
                    try:
                        data_content = {'content': filter_text_content}
                        content_sql.insert_one(data_content)
                        print('>>> stored successfully')
                    except Exception:
                        print('>>> store failed')

    if __name__ == '__main__':
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Referer': 'http://www.xxbqg5200.com/register.php?do=submit',
        }
        # Session cookies copied from a logged-in browser; requests expects a
        # name -> value mapping rather than a single raw 'Cookie' header string
        cookies = {
            'Hm_lvt_bbb2110ecd75330bec79c7868b24e681': '1575524043',
            'PHPSESSID': '03pt092b5nb8qsdl6pk425kh87',
            'jieqiUserInfo': 'jieqiUserId%3D1912%2CjieqiUserName%3Dduanyibo%2CjieqiUserGroup%3D3%2CjieqiUserName_un%3Dduanyibo%2CjieqiUserLogin%3D1575524132',
            'jieqiVisitInfo': 'jieqiUserLogin%3D1575524132%2CjieqiUserId%3D1912',
            'Hm_lpvt_bbb2110ecd75330bec79c7868b24e681': '1575524140',
        }
        q = Queue()
        for i in get_page():
            q.put(i)
        # Ten worker threads drain the queue; join() keeps the main thread
        # alive until every listing page has been processed
        threads = [threading.Thread(target=get_tushu_url) for _ in range(10)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
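
Once the crawl has run, the stored data can be checked straight from MongoDB. Below is a minimal sketch, reusing the collection and field names from the script above; the queried book title is a made-up placeholder, not a real record:

    import pymongo

    client = pymongo.MongoClient(host='localhost', port=27017)
    mg = client['biquge']

    # Book records and chapter-link records share 'biquge_info', so filter
    # on the field that only book records carry
    print(mg['biquge_info'].count_documents({'tushu_name': {'$exists': True}}))
    print(mg['tushu_content'].count_documents({}))

    # Fetch a few stored book links by title ('some book' is a placeholder)
    for doc in mg['biquge_info'].find({'tushu_name': 'some book'}).limit(5):
        print(doc['tushu_name'], doc['tushu_name_url'])

count_documents and find are standard PyMongo calls. Because categories, books, and chapters all land in the one biquge_info collection, every query has to filter on a distinguishing field, which is worth keeping in mind if you reuse this schema.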
• Original post: https://www.cnblogs.com/duanlinxiao/p/11993911.html