使用网页手机模式进行爬取,这样就可以避开翻页需要登录的问题
1.将数据存入数据库的操作进行封装
"""
CREATE TABLE weibo_data(
    id int primary key auto_increment,
    create_at varchar(30),
    reposts_count int,
    comments_count int,
    attitudes_count int,
    `text` text
) default charset=utf8mb4;
"""
import pymysql


class WeiboMysql(object):
    """Thin wrapper around a pymysql connection for the weibo_data table.

    Opens the connection on construction and closes it when the object is
    garbage-collected.
    """

    def __init__(self):
        # Connecting is the initialization.
        # NOTE(review): credentials are hard-coded; consider moving them to
        # environment variables or a config file.
        self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                    passwd='510520', db='pachong',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def execute_insert_sql(self, sql, data):
        """Execute a parameterized statement and commit.

        Rolls back on failure so the connection is left in a clean state,
        then re-raises the original error for the caller to handle.
        """
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise

    def __del__(self):
        # __init__ may have failed before cursor/conn were assigned, and
        # __del__ must never raise — guard both close calls.
        try:
            self.cursor.close()
            self.conn.close()
        except Exception:
            pass


if __name__ == '__main__':
    weibo = WeiboMysql()
    insert_sql = "INSERT INTO weibo_data(create_at, reposts_count, comments_count, attitudes_count, `text`) VALUES(%s, %s, %s, %s, %s)"
    data = ('12-18', '123', '123', '123', '画画baby')
    weibo.execute_insert_sql(insert_sql, data)
2.获取数据并存入数据库
import json

import requests
# remove_tags strips HTML markup out of the post text
from w3lib.html import remove_tags

# Project-local DB helper class
from weibo.weibo_mysql import WeiboMysql

weibo = WeiboMysql()

# Impersonate a mobile browser: the m.weibo.cn JSON API can be paginated
# without logging in, unlike the desktop site.
headers = {
    'cookie': '_T_WM=99370732608; XSRF-TOKEN=6b3c2d; WEIBOCN_FROM=1110005030; MLOGIN=0; M_WEIBOCN_PARAMS=oid%3D4561830166403683%26lfid%3D102803%26luicode%3D20000174%26fid%3D1005051211441627%26uicode%3D10000011',
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Mobile Safari/537.36',
    'referer': 'https://m.weibo.cn/u/1211441627',
}


def get_page_data(url):
    """Fetch one timeline page, insert every mblog row, return next since_id.

    Returns the `since_id` cursor for the following page (to be appended to
    the next request URL).

    Raises requests.RequestException on network/HTTP failure and
    KeyError/ValueError when the payload does not have the expected shape.
    """
    response = requests.get(url, headers=headers)
    # Fail fast on HTTP errors instead of trying to JSON-parse an error page.
    response.raise_for_status()
    res_dict = response.json()
    cards_list = res_dict['data']['cards']
    insert_sql = "INSERT INTO weibo_data(create_at, reposts_count, comments_count, attitudes_count, `text`) VALUES(%s, %s, %s, %s, %s)"
    for card in cards_list:
        # Cards without an 'mblog' key are non-post entries; skip them.
        if 'mblog' in card:
            mblog = card['mblog']
            text = remove_tags(mblog['text'])
            created_at = mblog['created_at']
            reposts_count = mblog['reposts_count']
            comments_count = mblog['comments_count']
            attitudes_count = mblog['attitudes_count']
            data = (created_at, reposts_count, comments_count, attitudes_count, text)
            print(data)
            # Persist the row
            weibo.execute_insert_sql(insert_sql, data)
    # The since_id of this page is the cursor the next request must carry.
    since_id = res_dict['data']['cardlistInfo']['since_id']
    return since_id


def main():
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=1211441627&containerid=1076031211441627'
    since_id = get_page_data(url)
    # Keep paginating until the server stops handing out a cursor or an
    # error occurs. Catch only the failures this loop can actually produce
    # instead of a bare except that would hide genuine bugs.
    while since_id:
        next_url = url + '&since_id=' + str(since_id)
        print(next_url)
        try:
            since_id = get_page_data(next_url)
        except (requests.RequestException, KeyError, ValueError) as exc:
            # KeyError/ValueError: payload shape changed or last page reached;
            # RequestException: network or HTTP failure.
            print('stopping pagination:', exc)
            break


if __name__ == '__main__':
    main()