python答题辅助

最近直播答题app很热门，由于之前看过跳一跳的python脚本（非常棒），于是也想写一个答题的脚本。

https://github.com/huanmsf/cai

思路：

1、截图

2、文字识别，提取问题和选项（分割后识别准确性会提高）

3、爬取网页数据，根据规则匹配选项

4、根据选项自动点击屏幕该位置（应该循环点击，防止刚好切换到西瓜妹）

5、重复前面步骤

存在的问题：

1、答题时间有限：如果爬取的链接多了，还没解析完时间就到了；爬取的少了又缺少分析数据，结果不靠谱。

2、问题和选项需要提取关键字匹配

3、可能要试试其他搜索引擎（百度垃圾信息严重影响正确率）

├── baidu.py
├── cai.png
├── main.py
├── need
│   └── chi_sim.traineddata
├── README
└── screenshot.py

main.py：

from screenshot import pull_screenshot
import time, urllib.request, baidu, os

try:
    import Image
except ImportError:
    from PIL import Image, ImageDraw

import pytesseract

# Distance from the top of the screen to the question, as a fraction of the
# screen height; varies with resolution (default 1920*1080)
top_off_c = 0.15
# Height of the question region (added to pixel offsets of the screenshot)
que_h = 300
# Height of one answer row
ans_h = 170

# Left/right margin skipped when cropping
l_r_off = 40

# Question filter: strings removed from the OCR'd question text
que_filter = ['.', ' ']

# Answer filter: strings removed from each OCR'd answer
ans_filter = ["《", "》", ' ']

# Questions already handled; main() skips a question that is in this list
que_list = []

# Option coordinates as (x1, y1, x2, y2). These are placeholders: main()
# computes the real tap coordinates locally and does not modify these.
point_A = (0, 0, 0, 0)
point_B = (0, 0, 0, 0)
point_C = (0, 0, 0, 0)


def draw():
    """Show ``cai.png`` with red guide lines around the text regions.

    Debugging aid for locating the question and answer areas. The lines are
    derived from ``top_off_c``, ``que_h``, ``ans_h`` and ``l_r_off`` (the same
    constants ``main`` crops with), so the picture keeps matching the OCR
    regions after those constants are tuned for another resolution.
    """
    img = Image.open('cai.png')
    w, h = img.size
    # Named `canvas` so the local does not shadow this function's own name.
    canvas = ImageDraw.Draw(img)

    que_top = h * top_off_c
    ans_top = que_top + que_h

    # Top and bottom edge of the question region.
    for y in (que_top, ans_top):
        canvas.line((l_r_off, y, w - l_r_off, y), fill="red")

    # Bottom edge of each of the three answer rows (70% of the width).
    for row in range(1, 4):
        y = ans_top + ans_h * row
        canvas.line((l_r_off, y, w * 0.7, y), fill="red")

    img.show()


def click(point):
    """Tap the screen by sending a very short swipe through adb.

    ``point`` is indexed as (x1, y1, x2, y2): the swipe goes from
    (x1, y1) to (x2, y2) with a duration argument of 1.
    """
    x1, y1, x2, y2 = point[0], point[1], point[2], point[3]
    swipe_args = [str(value) for value in (x1, y1, x2, y2, 1)]
    os.system('adb shell input swipe ' + ' '.join(swipe_args))


def _strip_chars(text, unwanted):
    """Return *text* with every string in *unwanted* removed.

    Surrounding whitespace is stripped before each removal.
    """
    for token in unwanted:
        text = text.strip().replace(token, "")
    return text


def main():
    """Run the capture -> OCR -> search -> tap loop until interrupted.

    Each pass pulls a screenshot, crops the question and the three answer
    rows out of it, runs tesseract on every crop, asks ``baidu.search`` for
    the index of the best-matching answer and taps that answer's row.
    Questions already handled (tracked in ``que_list``) and screenshots in
    which no question text was recognised are skipped.
    """
    while True:
        print(">>>>>>")
        pull_screenshot()
        img = Image.open('cai.png').convert('L')
        w, h = img.size

        que_top = h * top_off_c
        ans_top = que_top + que_h

        # Copy the chi_sim file under need/ to /usr/share/tesseract-ocr/4.00/
        # so that lang='chi_sim' is available.
        img_q = img.crop((l_r_off, que_top, w - l_r_off, ans_top))
        question = _strip_chars(
            pytesseract.image_to_string(img_q, lang='chi_sim'), que_filter)

        ans = []
        points = []
        for row in range(3):
            row_top = ans_top + ans_h * row
            row_bottom = ans_top + ans_h * (row + 1)
            img_row = img.crop((l_r_off, row_top, w * 0.7, row_bottom))
            text = _strip_chars(
                pytesseract.image_to_string(img_row, lang='chi_sim'),
                ans_filter)
            # baidu.search tests `answer in text`; an empty string would
            # match every text, so blank answers get a placeholder instead.
            ans.append(text if text.strip() else "&*&")

            # Tap target: a 20px swipe ending at the vertical middle of the row.
            row_mid = ans_top + ans_h / 2 * (2 * row + 1)
            points.append((w / 3 - 20, row_mid - 20, w / 3, row_mid))

        print(question)
        print(ans)

        if not question.strip():
            # No question text was recognised, so there is nothing to search
            # for; do not tap an option.
            continue

        if question in que_list:
            continue

        index = baidu.search(question, ans)
        # 0 -> first row, 1 -> second row, anything else -> third row.
        click(points[index] if index in (0, 1) else points[2])

        print("index" + str(index))
        que_list.append(question)


if __name__ == '__main__':
    main()

baidu.py：

# -*- coding:utf-8 -*-

import urllib, time, re

import lxml.etree as etree

# 答案积分规则
"""
某个答案首次出现在一篇文章中+10，再次+3
"""


def _parse_html(markup):
    """Parse *markup* with lxml and return the root element, or None.

    None is returned for blank input and whenever lxml cannot build a tree,
    so callers can skip the page instead of crashing on it.
    """
    if not markup or not markup.strip():
        return None
    try:
        return etree.HTML(markup)
    except (etree.LxmlError, ValueError):
        return None


def search(question, ans):
    """Pick the answer that the top Baidu results mention the most.

    The question is searched on Baidu, up to five result links are fetched,
    and each answer is scored per page: +10 the first time it appears in a
    page's text, +3 for every further text fragment of that page containing
    it.

    Args:
        question: Question text used as the search query.
        ans: List of candidate answer strings.

    Returns:
        Index into *ans* of the highest-scoring answer (the first one reached
        on a tie), or 0 when no answer was found anywhere or the results page
        could not be parsed.
    """
    # `import urllib` alone does not load the `parse` submodule; import it
    # here so this function does not depend on another module having done so.
    import urllib.parse

    cont = {}
    q_url = "http://www.baidu.com/s?word=" + urllib.parse.quote(question)
    result_tree = _parse_html(getdata(q_url))
    if result_tree is None:
        return 0

    url_list = result_tree.xpath('//h3[@class]/a[@data-click]/@href')[0:5]
    for url_item in url_list:
        if not url_item.startswith('http'):
            continue
        print(url_item)
        page_tree = _parse_html(getdata(url_item))
        if page_tree is None:
            # One unreadable page must not throw away the scores collected
            # from the other pages, so move on to the next link.
            continue
        content_list = page_tree.xpath('//div/text()|//span/text()|//p/text()')

        seen_on_page = set()
        for con in content_list:
            if not con.strip():
                continue
            for a in ans:
                if a not in con:
                    continue
                if a in seen_on_page:
                    cont[a] = cont.get(a, 0) + 3
                else:
                    cont[a] = cont.get(a, 0) + 10
                    seen_on_page.add(a)

            print(con)

    print(cont)
    if not cont:
        return 0
    # max() returns the first maximal entry, matching a stable descending sort.
    best_answer = max(cont.items(), key=lambda item: item[1])[0]
    return ans.index(best_answer)


def getdata(url, timeout=5):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address to download.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled site cannot hold up the caller.

    Returns:
        The decoded page text (undecodable bytes are dropped). A single
        space ``" "`` when the request cannot be built or opened, and an
        empty string when reading the body fails.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # here so this function does not depend on another module having done so.
    import http.client
    import urllib.request

    try:
        response = urllib.request.urlopen(
            urllib.request.Request(url), timeout=timeout)
    except (OSError, ValueError, http.client.HTTPException):
        # URLError/HTTPError/timeouts are OSError subclasses; ValueError
        # covers malformed URLs rejected before any connection is made.
        return " "

    # The context manager closes the connection even if reading fails.
    with response:
        try:
            return response.read().decode("utf-8", 'ignore')
        except (OSError, http.client.HTTPException):
            return ""

screenshot.py：

# -*- coding: utf-8 -*-
"""
手机屏幕截图的代码（参考跳一跳外挂源码）
"""
import subprocess
import os
import sys
from PIL import Image

# Capture strategy read by pull_screenshot(): values 1-3 read the PNG from the
# stdout of `adb shell screencap -p` (1 and 2 additionally apply a bytes
# replacement to the captured data); 0 saves the screenshot on the device and
# fetches it with `adb pull`.
SCREENSHOT_WAY = 3


def pull_screenshot():
    """Capture the phone screen through adb and store it as ``cai.png``.

    The strategy is selected by the module-level ``SCREENSHOT_WAY``:

    * 3 - write the stdout of ``adb shell screencap -p`` unchanged.
    * 2 - same, but replace ``\\r\\n`` with ``\\n`` in the captured bytes.
    * 1 - same, but replace ``\\r\\r\\n`` with ``\\n`` in the captured bytes.
    * 0 - save the screenshot on the device, then ``adb pull`` it into the
      current directory.

    Any other value does nothing.
    """
    if 1 <= SCREENSHOT_WAY <= 3:
        process = subprocess.Popen(
            'adb shell screencap -p',
            shell=True, stdout=subprocess.PIPE)
        # communicate() drains stdout and waits for adb to exit, so the child
        # process is reaped instead of lingering after every capture.
        binary_screenshot, _ = process.communicate()
        # NOTE(review): presumably undoes line-ending translation applied to
        # the PNG stream on some adb/device combinations - confirm.
        if SCREENSHOT_WAY == 2:
            binary_screenshot = binary_screenshot.replace(b'\r\n', b'\n')
        elif SCREENSHOT_WAY == 1:
            binary_screenshot = binary_screenshot.replace(b'\r\r\n', b'\n')
        with open('cai.png', 'wb') as f:
            f.write(binary_screenshot)
    elif SCREENSHOT_WAY == 0:
        os.system('adb shell screencap -p /sdcard/cai.png')
        os.system('adb pull /sdcard/cai.png .')

文字识别
sudo pip3 install pytesseract
sudo apt-get install tesseract-ocr

初级版本效果：

题外话：

最近在浏览FB站看到

冲顶大会辅助揭秘：王思聪撒的币，还是要靠技术来捡

文中提到可以提前10秒得到题目（不知是否属实），由于访问权限不能看，如有知道怎么搞的请留言交流下，谢谢

相关阅读:
爬虫header和cookie
爬虫代理squid
response对象
 pyspider中内容选择器常用方法汇总
 非阻塞 sleep
post请求体过大导致ngx.req.get_post_args()取不到参数体的问题
 常用lua代码块
 nginx静态文件缓存的解决方案
 lua-resty-gearman模块
 非在线PDF转图片！！！
原文地址：https://www.cnblogs.com/lanqie/p/8290590.html