最近直播答题app很热门,由于之前看过跳一跳的python脚本(非常棒),于是也想写一个答题的脚本。
https://github.com/huanmsf/cai
思路:
1、截图
2、文字识别,提取问题和选项(分割后识别准确性会提高)
3、爬取网页数据,根据规则匹配选项
4、根据选项自动点击屏幕该位置(应该循环点击,防止刚好切换到西瓜妹)
5、重复前面步骤
存在的问题:
1、答题时间有限:如果爬取的链接多了,还没解析完时间就到了;爬取的链接少了,又缺少分析数据,结果不靠谱。
2、问题和选项需要提取关键字匹配
3、可能要试试其他搜索引擎(百度垃圾信息严重影响正确率)
目录:
├── baidu.py
├── cai.png
├── main.py
├── need
│   └── chi_sim.traineddata
├── README
└── screenshot.py
main.py:
"""Quiz helper: screenshot the phone, OCR the question/options, tap an answer."""
import os
import subprocess
import time
import urllib.request

import pytesseract

try:
    # Legacy PIL layout: ImageDraw must be imported here too, otherwise
    # draw() fails with NameError whenever this branch succeeds.
    import Image
    import ImageDraw
except ImportError:
    from PIL import Image, ImageDraw

import baidu
from screenshot import pull_screenshot

# Distance from the top of the screen to the question area, as a fraction of
# the screen height; varies with resolution (tuned for 1920*1080).
top_off_c = 0.15
# Height of the question area.
que_h = 300
# Height of one answer row.
ans_h = 170
# Left/right margin.
l_r_off = 40
# Characters removed from the recognised question.
que_filter = ['.', ' ']
# Characters removed from the recognised answers.
ans_filter = ["《", "》", ' ']
# Questions that have already been handled.
que_list = []
# Option coordinates; main() computes the real values locally per screenshot.
point_A = (0, 0, 0, 0)
point_B = (0, 0, 0, 0)
point_C = (0, 0, 0, 0)


def _layout(w, h):
    """Return (question_box, answer_boxes, click_points) for a w*h screenshot.

    Boxes are (left, top, right, bottom); click points are
    (x1, y1, x2, y2) swipes ending at the vertical centre of each answer row.
    """
    top = h * top_off_c
    ans_top = top + que_h
    que_box = (l_r_off, top, w - l_r_off, ans_top)
    ans_boxes = [
        (l_r_off, ans_top + ans_h * row, w * 0.7, ans_top + ans_h * (row + 1))
        for row in range(3)
    ]
    points = []
    for row in range(3):
        centre_y = ans_top + ans_h / 2 * (2 * row + 1)
        points.append((w / 3 - 20, centre_y - 20, w / 3, centre_y))
    return que_box, ans_boxes, points


def _clean(text, filters):
    """Strip *text* and remove every substring listed in *filters*."""
    for unwanted in filters:
        text = text.strip().replace(unwanted, "")
    return text


def draw():
    """Open cai.png and show it with red guide lines around the text regions.

    Debug aid for tuning the layout constants: the lines are derived from the
    same constants main() crops with, so both always agree.
    """
    img = Image.open('cai.png')
    w, h = img.size
    canvas = ImageDraw.Draw(img)
    top = h * top_off_c
    canvas.line((l_r_off, top, w - l_r_off, top), fill="red")
    canvas.line((l_r_off, top + que_h, w - l_r_off, top + que_h), fill="red")
    for row in (1, 2, 3):
        y = top + que_h + ans_h * row
        canvas.line((l_r_off, y, w * 0.7, y), fill="red")
    img.show()


def click(point):
    """Send an `adb shell input swipe` for *point* = (x1, y1, x2, y2).

    Coordinates are rounded to whole pixels; the swipe duration argument is 1.
    The command is passed as an argument list so no shell parsing is involved.
    """
    coords = [str(int(round(value))) for value in point[:4]]
    subprocess.run(['adb', 'shell', 'input', 'swipe'] + coords + ['1'])


def main():
    """Loop forever: screenshot, OCR, look the question up, tap an option."""
    while True:
        print(">>>>>>")
        pull_screenshot()
        img = Image.open('cai.png')
        img = img.convert('L')
        w, h = img.size
        que_box, ans_boxes, points = _layout(w, h)

        # Copy the chi_sim traineddata file under need/ to
        # /usr/share/tesseract-ocr/4.00/ before running.
        question = pytesseract.image_to_string(img.crop(que_box), lang='chi_sim')
        raw_answers = [
            pytesseract.image_to_string(img.crop(box), lang='chi_sim')
            for box in ans_boxes
        ]

        question = _clean(question, que_filter)
        ans = [_clean(text, ans_filter) for text in raw_answers]
        # An empty option would match every page text, so substitute a
        # placeholder that is not expected to appear anywhere.
        ans = [a if a.strip() else "&*&" for a in ans]

        print(question)
        print(ans)

        # Nothing recognised (no question on screen) or already handled.
        if not question or question in que_list:
            continue

        index = baidu.search(question, ans)

        # Pick option 1, 2 or 3; anything other than 0/1 falls back to C.
        if index == 0:
            click(points[0])
        elif index == 1:
            click(points[1])
        else:
            click(points[2])

        print("index" + str(index))
        que_list.append(question)


if __name__ == '__main__':
    main()
baidu.py:
# -*- coding:utf-8 -*-
import http.client
import re
import time
import urllib
import urllib.parse
import urllib.request

import lxml.etree as etree

# Answer scoring rule: the first time an answer appears in an article it
# earns +10, every further appearance in the same article earns +3.
FIRST_HIT_SCORE = 10
REPEAT_HIT_SCORE = 3


def _parse_html(page):
    """Parse *page* with lxml and return the root element, or None.

    None is returned for blank input and for documents lxml rejects, so
    callers can skip a bad page instead of aborting the whole search.
    """
    if not page or not page.strip():
        return None
    try:
        return etree.HTML(page)
    except (etree.LxmlError, ValueError):
        # ValueError: lxml refuses str input carrying an XML encoding
        # declaration.
        return None


def search(question, ans):
    """Return the index in *ans* of the option best supported by Baidu results.

    The question is searched on Baidu, up to the first five result links are
    fetched, and each option is scored by how often it occurs in the text of
    those pages (see the scoring rule above).

    Args:
        question: question text used as the search query.
        ans: list of option strings.

    Returns:
        Index of the highest-scoring option, or 0 when nothing could be
        fetched or no option was found on any page.
    """
    cont = {}
    q_url = "http://www.baidu.com/s?word=" + urllib.parse.quote(question)
    selector = _parse_html(getdata(q_url))
    if selector is None:
        return 0

    # Empty options would match every text node and duplicates would be
    # counted twice, so score each distinct non-empty option once.
    candidates = list(dict.fromkeys(a for a in ans if a))

    url_list = selector.xpath('//h3[@class]/a[@data-click]/@href')[0:5]
    for url_item in url_list:
        if not url_item.startswith('http'):
            continue
        print(url_item)
        page = _parse_html(getdata(url_item))
        if page is None:
            # One unreadable page must not discard the scores collected
            # from the other pages.
            continue
        try:
            content_list = page.xpath('//div/text()|//span/text()|//p/text()')
        except etree.LxmlError:
            continue

        seen_on_page = set()
        for con in content_list:
            if not con.strip():
                continue
            for a in candidates:
                if a not in con:
                    continue
                gain = REPEAT_HIT_SCORE if a in seen_on_page else FIRST_HIT_SCORE
                cont[a] = cont.get(a, 0) + gain
                seen_on_page.add(a)
                print(con)

    print(cont)
    if not cont:
        return 0
    # max() keeps the first of equally scored options, matching a stable
    # descending sort.
    best = max(cont, key=cont.get)
    return ans.index(best)


def getdata(url, timeout=5):
    """Fetch *url* and return its body decoded as UTF-8 (errors ignored).

    Args:
        url: address to fetch.
        timeout: seconds to wait on the connection before giving up, so one
            slow site cannot stall a time-limited lookup.

    Returns:
        The page text; " " when the request cannot be made, "" when the
        body cannot be read.
    """
    try:
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req, timeout=timeout)
    except (ValueError, OSError, http.client.HTTPException):
        # URLError/HTTPError and socket timeouts are OSError subclasses;
        # ValueError covers malformed URLs.
        return " "
    with response:
        try:
            return response.read().decode("utf-8", 'ignore')
        except (OSError, http.client.HTTPException):
            return ""
screenshot.py:
# -*- coding: utf-8 -*-
"""
Phone screenshot code (based on the "Jump Jump" game bot source).
"""
import subprocess
import os
import sys

from PIL import Image

# 1-3: read the PNG from `adb shell screencap -p` stdout (1 and 2 also repair
# line endings); 0: capture to /sdcard on the device and `adb pull` it.
SCREENSHOT_WAY = 3


def pull_screenshot():
    """Capture the phone screen through adb and save it as cai.png.

    The capture method is selected by the module-level SCREENSHOT_WAY; any
    value outside 0-3 does nothing.
    """
    if 1 <= SCREENSHOT_WAY <= 3:
        result = subprocess.run(
            ['adb', 'shell', 'screencap', '-p'], stdout=subprocess.PIPE)
        binary_screenshot = result.stdout
        # NOTE(review): the published listing showed replace(b' ', b' ') in
        # both branches, a no-op that looks like lost escape sequences. The
        # line-ending repairs below are restored on the assumption that they
        # match the referenced jump-game screenshot code - confirm.
        if SCREENSHOT_WAY == 2:
            binary_screenshot = binary_screenshot.replace(b'\r\n', b'\n')
        elif SCREENSHOT_WAY == 1:
            binary_screenshot = binary_screenshot.replace(b'\r\r\n', b'\n')
        with open('cai.png', 'wb') as f:
            f.write(binary_screenshot)
    elif SCREENSHOT_WAY == 0:
        subprocess.run(['adb', 'shell', 'screencap', '-p', '/sdcard/cai.png'])
        subprocess.run(['adb', 'pull', '/sdcard/cai.png', '.'])
文字识别依赖安装:
sudo pip3 install pytesseract
sudo apt-get install tesseract-ocr
初级版本效果:
题外话:
最近在浏览FB站看到
文中提到可以提前10秒得到题目(不知是否属实)。由于没有访问权限,我无法查看原文,如有知道怎么实现的朋友请留言交流,谢谢。