Python 利用Python编写简单网络爬虫实例3

Python 利用Python编写简单网络爬虫实例3

利用Python编写简单网络爬虫实例3

by:授客 QQ：1033553122

实验环境

python版本：3.3.5（2.7下报错）

实验目的

获取目标网站“http://bbs.51testing.com/forum.php”中特定url，通过分析发现，目标url同其它url的关系如下

目标url存在子页面中的文章中，随机分布，我们要把它找出来

python脚本

#!/usr/bin/env python

# -*- coding:utf-8 -*-

from urllib.request import *

import gzip, re

from io import BytesIO

from html.parser import HTMLParser

# 爬虫类

class Reptile:
    """Download web pages and filter forum/thread links out of URL sets."""

    def __init__(self):
        # URLs that get_page() has already been asked to download.
        self.url_set = set()
        # Body (bytes) of the most recent successful download; empty otherwise.
        self.data = b""

    def get_page(self, url, headers, timeout=30):
        """Download *url* and return its (decompressed) body as bytes.

        Args:
            url: Address of the page to fetch.
            headers: Mapping of extra request headers (e.g. a User-Agent).
            timeout: Seconds to wait on the connection before giving up.

        Returns:
            The page body, or empty bytes when the download failed or the
            server did not answer with status 200.

        Raises:
            ValueError: If *url* is not a valid URL (raised by ``Request``).
        """
        request = Request(url, headers=headers)
        # Ask for a gzip-compressed body to reduce network traffic.
        request.add_header('Accept-encoding', 'gzip')

        # Start from an empty result so a failed download never hands back
        # the content of the previously downloaded page.
        self.data = b""
        try:
            # The context manager closes the connection even on errors.
            with urlopen(request, timeout=timeout) as response:
                if response.code == 200:
                    body = response.read()
                    encoding = response.info().get("Content-Encoding")
                    self.data = self._decompress(body, encoding)
        except Exception as exc:
            # Crawling is best-effort: report the failure and carry on with
            # an empty page instead of aborting the whole run.
            print("failed to download %s: %s" % (url, exc))

        # The URL is recorded whether or not the download succeeded.
        self.url_set.add(url)
        return self.data

    @staticmethod
    def _decompress(body, content_encoding):
        """Return *body* decoded according to its Content-Encoding value."""
        if content_encoding == "gzip":
            return gzip.decompress(body)
        # The server ignored the gzip request; the body is already plain.
        print("gzip unused")
        return body

    @staticmethod
    def _drain_matching(url_set, home, include):
        """Empty *url_set* and return ``home + url`` for every matching url."""
        matched = set()
        # The caller's set is deliberately consumed so that it is empty
        # again afterwards.
        while url_set:
            url = url_set.pop()
            # Entries that are not strings (e.g. None) cannot be matched.
            if isinstance(url, str) and re.search(include, url):
                matched.add(home + url)
        return matched

    def get_forum_url(self, url_set, home, include):
        """Return the absolute URLs of the forum boards found in *url_set*.

        Board links usually look like ``forum-53-1.html``, so each match is
        prefixed with *home*.

        Args:
            url_set: Set of candidate URLs; it is emptied by this call.
            home: Prefix prepended to every matching URL.
            include: Regular expression (string or compiled) selecting the
                board URLs.

        Returns:
            A new set with the prefixed board URLs.
        """
        return self._drain_matching(url_set, home, include)

    def get_title_url(self, url_set, home, include):
        """Return the absolute URLs of the threads found in *url_set*.

        Thread links usually look like ``thread-1044711-1-1.html``, so each
        match is prefixed with *home*.

        Args:
            url_set: Set of candidate URLs; it is emptied by this call.
            home: Prefix prepended to every matching URL.
            include: Regular expression (string or compiled) selecting the
                thread URLs.

        Returns:
            A new set with the prefixed thread URLs.
        """
        return self._drain_matching(url_set, home, include)



# 解析器类

class MyHtmlParser(HTMLParser):
    """HTML parser that collects the value of every ``href`` attribute.

    Collected values accumulate in ``self.url_set`` across ``feed()`` calls
    until the set is emptied by the caller or ``reset()`` is called.
    """

    def __init__(self, *args, **kwargs):
        """Create the parser, tolerating the legacy ``strict`` keyword.

        Newer Python versions removed the ``strict`` parameter from
        ``HTMLParser``; when the base class rejects it, it is dropped and
        the construction is retried so older call sites keep working.
        """
        try:
            HTMLParser.__init__(self, *args, **kwargs)
        except TypeError:
            if "strict" not in kwargs:
                raise
            kwargs.pop("strict")
            HTMLParser.__init__(self, *args, **kwargs)

    def reset(self):
        """Reset the parser state and start with an empty URL set."""
        # Run the base-class reset first, then (re)create our own state.
        HTMLParser.reset(self)
        self.url_set = set()

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` values of a start tag.

        Args:
            tag: Name of the tag (unused; any tag with an href counts).
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        for key, value in attrs:
            # An attribute written without a value is reported as None;
            # such entries are not URLs and would break later regex
            # matching, so they are skipped.
            if key == "href" and value is not None:
                self.url_set.add(value)





############## test run ################

# Send a browser-like User-Agent so that sites rejecting crawlers still answer.
headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0"}
init_url = "http://bbs.51testing.com/forum.php"
home = "http://bbs.51testing.com/"
filepath = "d:/url.txt"


def _page_text(page):
    """Return a downloaded page as text suitable for the HTML parser.

    ``str()`` on bytes would produce the ``b'...'`` repr (escaped quotes and
    newlines), which corrupts the markup, so bytes are decoded instead.
    """
    if isinstance(page, bytes):
        # NOTE(review): the site's charset is not known here. Only the ASCII
        # markup and href values are needed, so undecodable bytes are
        # replaced rather than treated as an error.
        return page.decode("utf-8", errors="replace")
    return str(page)


# Build the parser. The ``strict`` keyword was removed from HTMLParser in
# newer Python versions, so fall back to the default constructor if rejected.
try:
    parser = MyHtmlParser(strict = False)
except TypeError:
    parser = MyHtmlParser()

# Download and parse the start page.
page_number = 1
print("program is downloading the frist url page")
reptile = Reptile()
page = reptile.get_page(init_url, headers)
print("processing the %dth url page" % page_number)
parser.feed(_page_text(page))

# Board URLs of interest. Split over several variables only to keep the
# lines short.
pattern1 = "forum-122-[1-9]|forum-243-[1-9]|forum-40-[1-9]|forum-63-[1-9]"
# The last alternative used the character class "[-9]", which only matches
# "-" or "9"; it is "[1-9]" like all the others.
pattern2 = "|forum-42-[1-9]|forum-53-[1-9]|forum-275-[1-9]|forum-140-[1-9]"
pattern3 = "|forum-138-[1-9]|forum-139-[1-9]|forum-141-[1-9]"
pattern = pattern1 + pattern2 + pattern3
include = re.compile(pattern)
forum_url_set = reptile.get_forum_url(parser.url_set, home, include)

# For every board, collect the URLs of its paging links (sub-pages).
result_url_set = set()
for forum_index, forum_url in enumerate(forum_url_set, 1):
    page = reptile.get_page(forum_url, headers)
    parser.feed(_page_text(page))

    print("getting the board urls in the %dth forum page" % forum_index)
    tmp_url_set = reptile.get_forum_url(parser.url_set, home, include)

    # Union, not symmetric difference: a URL found on two boards must be
    # kept, not cancelled out.
    result_url_set |= tmp_url_set

# Thread URLs look like thread-<6 or 7 digits>-<digit>-<1 or 2 digits>.html.
# The pattern does not change between pages, so compile it once.
thread_pattern1 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{1}[.]html|"
thread_pattern2 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{1}[.]html|"
thread_pattern3 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{2}[.]html|"
thread_pattern4 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{2}[.]html"
thread_include = re.compile(
    thread_pattern1 + thread_pattern2 + thread_pattern3 + thread_pattern4
)

# Collect the thread URLs listed on every board page.
title_url_set = set()
for forum_index, forum_url in enumerate(result_url_set, 1):
    page = reptile.get_page(forum_url, headers)
    parser.feed(_page_text(page))

    print("getting all title urls in the %dth forum board" % forum_index)
    tmp_url_set = reptile.get_title_url(parser.url_set, home, thread_include)

    # Union for the same reason as above.
    title_url_set |= tmp_url_set

# Scan every thread for the target URLs and append them to the output file.
target_include = re.compile(
    "http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9]{5}"
)
target_index = 1
for title_index, title_url in enumerate(title_url_set, 1):
    print("processing the %dth title url" % title_index)
    page = reptile.get_page(title_url, headers)
    parser.feed(_page_text(page))

    # Append per page so results found so far survive an interrupted run.
    with open(filepath, "a") as f:
        # Draining the set also leaves it empty for the next page.
        while parser.url_set:
            url = parser.url_set.pop()
            # Skip empty/None entries before matching.
            if url and target_include.search(url):
                print("find target! saving the %dth target url in the %dth title page" % (target_index, title_index))
                # One entry per line.
                f.write("the %dth url: %s\n" % (target_index, url))
                target_index += 1

print("complete")

结果：

声明：仅供学习研究使用，请勿用于其它非法用途
相关阅读:
SGU 176.Flow construction （有上下界的最大流）
POJ 2391.Ombrophobic Bovines （最大流）
poj 1087.A Plug for UNIX （最大流）
poj 1273.PIG （最大流）
POJ 2112.Optimal Milking （最大流）
SGU 196.Matrix Multiplication
SGU 195. New Year Bonus Grant
关于multicycle path
ppt做gif动图
 codeforces 598A Tricky Sum
原文地址：https://www.cnblogs.com/shouke/p/10157942.html