• 2019/6/27 —— 检查


    -----------------------------------------------------spider_Un.py---------------------------------------------------------------------------------------------------
    import requests
    import time
    from lxml import etree

    def get_html(url, max_retries=10):
        """Fetch *url* and return the decoded page body as text.

        Retries with a short pause when the server answers with a
        non-200 status (bounded by *max_retries* instead of the original
        unbounded recursion, which could exhaust the stack).  Returns
        ``None`` after all retries fail or on a network error.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        for _ in range(max_retries):
            try:
                res = requests.get(url, headers=headers)
                # Guess the encoding from the body, not the (often wrong) header.
                res.encoding = res.apparent_encoding
                if res.status_code == 200:
                    return res.text
                time.sleep(0.1)  # brief back-off before the next attempt
            except Exception as e:  # network/timeout error: report and give up
                print("问题是", e)
                return None
        return None

    def parse(html):
        """Print the embedded <script> payload of a school-search page.

        The school list on chsi.com.cn is embedded in inline JavaScript,
        so the data lives in a script tag rather than in normal HTML
        elements.
        """
        print(html)
        r = etree.HTML(html)
        # Index 1: skip the first script tag — presumably unrelated
        # boilerplate; TODO confirm against a live page.
        node_list = r.xpath("//div[@class='container']//script/text()")[1]
        print(node_list)
        print(len(node_list))

    def url_join():
        """Build the list of all 138 school-search page URLs.

        Each page shows 20 records, so page ``i`` starts at record
        ``20 * (i - 1)``: 0, 20, 40, ..., 2740.
        """
        url_start = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-'
        url_end = '.dhtml'
        return [url_start + str(20 * i - 20) + url_end for i in range(1, 139)]
    if __name__ == '__main__':
        # Full crawl (disabled for now):
        # url_list = url_join()
        # for url in url_list:
        #     html = get_html(url)
        #     parse(html)

        # Only the first listing page is fetched and parsed.
        url = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-0.dhtml'
        html = get_html(url)
        parse(html)
    --------------------------------------------------------------------------------------------------------------------------------weixin.py---------------------------------------
    #Weichat

    import scrapy
    import urllib.parse
    from news_project.middlewares import Deal_Content
    from news_project.items import NewsProjectItem
    from lxml import etree
    import js2py
    import time
    import requests
    import re
    import bs4

    class Weichat(scrapy.Spider):
        """Spider that scrapes articles of the 宝鸡招商局 WeChat official
        account via the sogou WeChat search front-end."""

        name = 'Weichat'
        base = 'https://mp.weixin.qq.com'
        # Bare domains only — never include a scheme such as http:// here.
        allowed_domains = ['weixin.sogou.com']
        start_urls = ['http://weixin.sogou.com']

        def parse(self, response):
            """Kick off a sogou account search for the target account."""
            url_1 = 'https://weixin.sogou.com/weixin?type=1&query={}&ie=utf8&s_from=input&_sug_=y&_sug_type_='.format('宝鸡招商局')
            yield scrapy.Request(url=url_1, callback=self.detail_parse, dont_filter=True)

        def detail_parse(self, response):
            """Follow the first account hit to the account profile page."""
            baoji_url = response.xpath("//a[@uigs='account_name_0']/@href").extract_first()
            print('baoji_url', baoji_url)
            print('baoji_url', type(baoji_url))
            yield scrapy.Request(url=baoji_url, callback=self.baoji_parse, dont_filter=True)

        def baoji_parse(self, response):
            """Extract the article list (msgList) the profile page embeds
            in inline JavaScript, then follow the newest article."""
            selector = etree.HTML(response.text)
            print("------------------宝鸡招商局---------------------", response.text)
            script = selector.xpath('.//script[not(@nonce) and @type="text/javascript"]/text()')
            script = script[1]  # the second script tag holds the msgList assignment
            script = script.replace('seajs.use("sougou/profile.js");', '')
            # Wrap the raw assignment so js2py can evaluate it and return msgList.
            script = 'function getList(){' + script + 'return msgList }'
            script = script.replace('amp;', '')  # un-escape &amp; inside the URLs
            getList = js2py.eval_js(script)
            js = getList()
            # HACK: round-trip through str()/eval() to turn the js2py object
            # into a plain dict.  eval on page-derived text is unsafe —
            # js.to_dict() would be the safer replacement; verify before use.
            js = str(js)
            js = eval(js)
            lis = js.get('list')
            firstLinks = []
            otherStyleTimes = []
            for li in lis:
                # Publish time: unix timestamp -> "YYYY-mm-dd HH:MM:SS" for the DB.
                datimes = li['comm_msg_info']['datetime']
                timeArray = time.localtime(datimes)
                otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)

                # Article URL, used later to build the real article link.
                try:
                    content_url = li.get('app_msg_ext_info').get('content_url')
                    print(content_url)
                    firstLink = self.base + content_url
                except (IndexError, AttributeError, TypeError):
                    # app_msg_ext_info missing (li.get returns None, so the
                    # chained .get raises AttributeError, not IndexError) —
                    # usually means sogou served a CAPTCHA page.
                    firstLink = None
                    print('CAPTCHA!')
                firstLinks.append(firstLink)
                otherStyleTimes.append(otherStyleTime)
            print('firstLinks, otherStyleTimes***********************', firstLinks, otherStyleTimes)
            yield scrapy.Request(url=firstLinks[0], callback=self.baoji_detail_parse,
                                 meta={'time': otherStyleTimes[0]}, dont_filter=True)

        def baoji_detail_parse(self, response):
            """Parse one article page into a NewsProjectItem."""
            item = NewsProjectItem()
            content = ''
            meta = response.meta
            print("response.url", response.url)
            detailPage = bs4.BeautifulSoup(response.text, "html.parser")
            # Article title, stripped of whitespace/punctuation noise.
            title = detailPage.title.text.replace(' ', '').replace(' ', '').replace(' ', '').replace('!', '').replace('|', '')
            print('title', title)

            sections = detailPage.findAll('section', class_='_editor')
            # Article body from the editor sections; the last three are
            # presumably footer boilerplate — TODO confirm on a live page.
            for section in sections[:-3]:
                content = content + section.text.replace(' ', '').replace(' ', '').replace(' ', '')

            print("meta['time']**************", meta['time'])
            print("content---------------", content)

            item['title_url'] = response.url

            # Detail-page content node; named 'node' so it no longer shadows
            # the imported lxml.etree module.
            node = response.xpath('//div[@id="js_content"]')
            tagContet = ''.join(node.extract())

            content = ''.join(node.xpath('.//text()').extract())
            img_urls = node.xpath('.//img/@src').extract()

            img_urls_dict = {}
            for url in img_urls:
                if "http://网站" not in url:
                    url1 = urllib.parse.urljoin(response.url, url)  # absolutize the image URL
                    img_urls_dict[url] = url1

            print("*******img_urls_dict****", img_urls_dict)
            item['content'], item['tags'] = Deal_Content.handleText(content, tagContet, img_urls_dict, title)
            print("************item['tags']********************", item['tags'])

            # Strip the trailing "font-size: 18px;..." fragment leaking from
            # inline styles; guard the lookup so a non-matching page no longer
            # raises IndexError on [0].
            leak = re.findall("font-size: 18px;(.*)", item['content'])
            if leak:
                item['content'] = item['content'].replace(leak[0], '')

            item['title'] = title
            item['time'] = meta['time']

            # news_id avoids shadowing the builtin id().
            news_id, pid = Deal_Content.sql_read(response.url)
            item['id'] = news_id
            item['pid'] = pid
            item['type_cn'] = "省市级"
            # news: which site/source this article came from
            item['news'] = '宝鸡招商局'
            # type_no mirrors the category id
            item['type_no'] = 18

            yield item
    ----------------------------------------------------------js.py-----------------------------------------------------------------------------------------


  • 相关阅读:
    安装依赖包
    python之高阶函数编程
    ubuntu之安装java浏览器插件
    python之函数式编程
    openstack第1天
    mysql null 相关bug
    nginx 【转】
    python mock
    查看docker 内各个容器的ip
    python sqlparse 各种 token
  • 原文地址:https://www.cnblogs.com/yuanjia8888/p/11099010.html
Copyright © 2020-2023  润新知