• Scraping a WeChat Official Account Media Library with Python


This is some code I wrote a while back; I'm publishing it on cnblogs today in case I need it again.

Let's get started:

        

    #coding:utf-8
    import werobot
    import pymongo
     
    class Gongzhonghao():
     
        def __init__(self,token,APP_ID,ENCODING_AES_KEY,APP_SECRET):
            self.robot = werobot.WeRoBot(token = token)
            self.robot.config['HOST'] = '0.0.0.0'
            self.robot.config['PORT'] = 80
            self.robot.config['APP_ID'] = APP_ID
            self.robot.config['ENCODING_AES_KEY'] = ENCODING_AES_KEY
            self.robot.config['APP_SECRET'] = APP_SECRET
     
        def _getNews_Count(self):
            """
            获取公众号图文消息总数
            :return: Int
            """
            mediacount = self.robot.client.get_media_count()
            news_count = mediacount['news_count']
            return news_count
     
        def getNews(self):
            """
            获取公众号所有的图文内容
            :return: Json
            """
            i = 0
            items = []
            news_count = self._getNews_Count()
            while i < news_count:
                # the material API returns at most 20 items per call, so page through
                tempj = self.robot.client.get_media_list('news', i, 20)
                items = tempj['item'] + items
                i = i + 20
            j = {
                'total_count': news_count,
                'items': items
            }
            return j
     
        def echo(self):
            """
            用于公众号后台初次配置的验证
            :return: null
            """
            self.robot.run()
     
    if __name__ == '__main__':
        # placeholder credentials: token, APP_ID, ENCODING_AES_KEY, APP_SECRET
        g = Gongzhonghao('1', '2', '3', '4')
        j = g.getNews()
        client = pymongo.MongoClient('ip', 27017)  # replace 'ip' with the MongoDB host
        db = client.gongzhonghao
        xxx = db.xxx
        xxx.insert_one(j)  # insert() is deprecated in PyMongo; use insert_one()
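
For reference, WeRoBot's `get_media_list` wraps WeChat's `material/batchget_material` endpoint. If you would rather not pull in WeRoBot for the fetch step, the same page of materials can be requested directly (a minimal sketch; access-token retrieval is omitted, and the endpoint and fields follow WeChat's public material API):

    import requests

    def batchget_material(access_token, offset=0, count=20):
        # POST to WeChat's material list endpoint; type 'news' selects image-and-text articles
        url = "https://api.weixin.qq.com/cgi-bin/material/batchget_material"
        payload = {"type": "news", "offset": offset, "count": count}
        resp = requests.post(url, params={"access_token": access_token}, json=payload)
        return resp.json()  # carries 'total_count' and an 'item' list, as used above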
    

Next, connect to the database and parse what was stored; among other fields, each document holds the HTML source of the articles.
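
A document written by the first script has roughly this shape (a sketch based on the batchget_material response format; the values are illustrative only):

    # shape of one stored document (illustrative):
    {
        "total_count": 2,
        "items": [
            {
                "media_id": "...",
                "content": {
                    "news_item": [
                        {"title": "...", "thumb_url": "...", "content": "<html>...</html>"}
                    ]
                }
            }
        ]
    }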

         

    # -*- coding:utf-8 -*-
     
    import os
    import urllib.parse
    from html.parser import HTMLParser
     
    import requests
    from bs4 import BeautifulSoup
    from pymongo import MongoClient
     
     
    class ContentHtmlParser(HTMLParser):
        """
        过滤html标签
        """
     
        def __init__(self):
            HTMLParser.__init__(self)
            self.text = ""
     
        def handle_data(self, data):
            self.text += data
     
        def get_text(self):
            return self.text
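
    # Illustrative example: feeding "<p>最具规模</p>" through filter_content()
    # below yields "最具规模" -- only visible text reaches the banned-word scan.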
     
     
    mongo_client = MongoClient("ip", 27017)  # replace "ip" with the MongoDB host
    mongo_db = mongo_client["gongzhonghao"]
     
     
    def get_words():
        words = []
        with open("words.txt", encoding="utf-8") as words_file:
            for lines in words_file.readlines():
                if len(lines.strip()) == 0:
                    continue

                if lines.find("、") != -1:
                    # a single line may hold several words joined by "、"
                    for p in lines.split("、"):
                        words.append(p.replace("\n", ""))
                else:
                    words.append(lines.replace("\n", ""))
        return words
     
     
    def get_articles(clt):
        articles = []
     
        collection = mongo_db[clt]
        doc = collection.find_one()
        items = doc["items"]
        for it in items:
            # each item wraps a news_item list; only the first (cover) article is taken
            content = it["content"]["news_item"][0]
            articles.append(content)
     
        return articles
     
     
    def download(dir, file_name, url):
        if not os.path.exists(dir):
            os.mkdir(dir)

        try:
            resp = requests.get(url)

            path = os.path.join(dir, file_name)

            if os.path.exists(path):
                return

            with open(path, "wb") as f:
                f.write(resp.content)
        except requests.RequestException:
            # log the failing URL and carry on
            print(url)
     
    def find_images(content):
        imgs = []
        c = urllib.parse.unquote(content)
        img_labels = BeautifulSoup(c, "html.parser").find_all("img")
        for img in img_labels:
            # WeChat lazy-loads article images, so the real URL lives in data-src
            src = img.get("data-src")
            imgs.append(src)
        return imgs
     
     
    def get_suffix(url):
        try:
            # WeChat image URLs usually end in "wx_fmt=<format>"
            suffix = url[url.rindex("=") + 1:]
            if suffix == "jpeg" or suffix == "other":
                return ".jpg"
            return "." + suffix
        except (ValueError, AttributeError):
            return ".jpg"
     
     
    def filter_content(content):
        parser = ContentHtmlParser()
        parser.feed(content)
        return parser.get_text()
     
     
    def check_jinyongci(content):
        fc = filter_content(content)
        words = get_words()
        invalids = []
        for w in words:
            if fc.find(w) != -1:
                invalids.append(w)
        return invalids
     
     
    def save_jinyongci(clt, title, invalids):
        if len(invalids) == 0:
            return

        file = os.path.join(clt, "invalid.txt")

        with open(file, "a+", encoding="utf-8") as f:
            f.write("Title: " + title)
            f.write("\nBanned words: ")

            for iv in invalids:
                f.write(iv)
                f.write("、")

            f.write("\n\n")
     
     
    if __name__ == "__main__":
        clt = "xxx"
     
        if not os.path.exists(clt):
            os.mkdir(clt)
     
        articles = get_articles(clt)
        print(clt + ": 共" + str(len(articles)) + "个")
     
        for i in range(0, len(articles)):
            print("正在处理第 " + str(i) + " 个")
     
            title = articles[i]["title"]
            thumb_url = articles[i]["thumb_url"]
            content = articles[i]["content"]
     
            # 下载封面
            # path = os.path.join(clt, title)
            fname = str(i) + "_" + title.replace("|", "").replace("<", "").replace(">", "")
            download(clt, fname + get_suffix(thumb_url), thumb_url)
     
            # 找出文章中的图片
            imgs = find_images(content)
            index = 0
            for img in imgs:
                download(clt, fname + "_" + str(index) + get_suffix(img), img)
                index = index + 1
     
            # 找出文章中的敏感词
            invalids = check_jinyongci(content)
            print(invalids,'----',title)
            save_jinyongci(clt, title, invalids)
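
One caveat: the filename cleanup above only strips `|`, `<`, and `>`, while Windows also rejects `\ / : * ? "` in filenames. A more thorough sanitizer could look like this (a sketch; `sanitize_filename` is not part of the original script):

    import re

    def sanitize_filename(name):
        # replace every character Windows forbids in filenames with an underscore
        return re.sub(r'[\\/:*?"<>|]', "_", name)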
    

Attached is the list of superlative/banned words used for the filtering; save it as words.txt next to the script.

          

    最大程度、最高级、最高端、最奢侈、最低级、最便宜、史上最低价、最流行、最受欢迎、最先进科学、最新技术、最新科学
     
    中国第一、全网第一、销量第一、排名第一、第一品牌、NO.1、TOP1、独一无二、全国第一、最后一波、大品牌之一、销冠
     
    国家级、国际级、世界级、千万级、百万级、星级、5A、甲级、超甲级
     
    顶级、尖端、顶尖、顶级享受、完美、至尊、空前、绝后、绝版、非此莫属、巅峰、前所未有、完美、翘楚之作、不可再生、不可复制、绝无仅有、寸土寸金、淋漓尽致、无与伦比、唯一、卓越
     
    前无古人后无来者、绝版、珍稀、臻稀、稀少、绝无仅有、绝不在有、稀世珍宝、千金难求、世所罕见、不可多得、空前绝后、寥寥无几、屈指可数
     
    独家、独创、独据、开发者、缔造者、创始者、发明者
     
    首个、首选、独家、首发、首席、首府、首选、首屈一指、全国首家、国家领导人、国门、国宅、首次、填补国内空白、国际品质
     
    大牌、金牌、名牌、王牌、领先上市、巨星、著名、掌门人、至尊、冠军
     
    世界领先、领先、领导者、领袖、引领、创领、领航、耀领
      
    史无前例、前无古人、永久、万能、百分之百
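
The list above is already in the format get_words() expects: terms joined by "、" within each line, with blank lines skipped. A quick sanity check (a sketch):

    # quick check that words.txt parses as expected
    words = get_words()
    print("loaded", len(words), "banned words; first few:", words[:5])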
    

      

        
