• Python crawler


    #!/usr/bin/python3
    # -*- coding: UTF-8 -*-

    # Standard-library modules used by the crawler.
    import re
    import time
    from urllib.request import Request, urlopen

    # Third-party MySQL driver, used to record which pages have been crawled.
    import mysql.connector

    def saveDownedurl(downedurl):
        """Record a crawled page URL in the downedurl table of the picurl database."""
        conn = mysql.connector.connect(user='root', password='694521', database='picurl')
        cursor = conn.cursor()
        # Parameterized query so the URL is quoted/escaped by the driver.
        sql = "INSERT INTO downedurl (picurl) VALUES (%s)"
        cursor.execute(sql, [downedurl])
        conn.commit()
        print(cursor.rowcount, "record inserted.")
        conn.close()


    def download_pic(pic_url, root_url, down_times):
        """Download one image, save it as <down_times>.jpg and return the next index."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
            # Some image hosts only serve the file when the Referer is the page it appears on.
            'Referer': root_url
        }
        down_path = str(down_times) + '.jpg'
        print(down_path)
        req = Request(pic_url, headers=headers)
        data = urlopen(req).read()
        with open(down_path, 'wb') as f:
            f.write(data)
        return down_times + 1




    def jiexi_rootPic_url(next_rootUrl, down_times):
        """Fetch one gallery page, download every image it references and return the updated index."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
        }
        downtime = down_times
        request_url = Request(next_rootUrl, headers=headers)
        response = urlopen(request_url).read().decode("utf-8")
        # Collect every img src attribute on the page.
        pattern = re.compile('<img src="(.*?)"', re.IGNORECASE)
        pic_path = pattern.findall(response)
        for i in pic_path:
            print('download_prepare')
            downtime = download_pic(i, next_rootUrl, downtime)
            print(i)
        # Pause between pages so the site is not hammered.
        time.sleep(2)
        return downtime


    def jiexi_url(root_url, down_times):
        """Fetch the listing page, find every /rnyy...html gallery link and crawl each of them."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
        }
        downtime = down_times
        request_url = Request(root_url, headers=headers)
        html = urlopen(request_url).read().decode("utf-8")
        # Gallery pages are linked from the listing page as /rnyy<id>.html.
        pattern = re.compile('/rnyy(.*?).html', re.IGNORECASE)
        all_next_root = pattern.findall(html)
        for i in all_next_root:
            path = 'http://mmff30.com/rnyy' + i + '.html'
            print(path)
            saveDownedurl(path)
            downtime = jiexi_rootPic_url(path, downtime)
        return downtime




    # Start crawling from the listing page; downloaded images are numbered from 4000 upwards.
    jiexi_url('http://mmff30.com/rwmy_9_3.html', 4000)
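
The saveDownedurl helper assumes that a downedurl table already exists in the picurl database. The original post never shows the schema, so the following is only a minimal sketch of the one-time setup implied by the INSERT statement above: a single text column named picurl.

    import mysql.connector

    # Hypothetical one-time setup; the single-column schema is an assumption
    # inferred from the INSERT statement in saveDownedurl.
    conn = mysql.connector.connect(user='root', password='694521', database='picurl')
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS downedurl (picurl TEXT)")
    conn.commit()
    conn.close()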
  • Original article: https://www.cnblogs.com/ytCui/p/13055992.html