• Python 3 Getting Started Tutorial


    python : 3.5

    jdk : 1.7

    eclipse : 4.5.2 (a bit old; it should be matched with Neon 4.6, otherwise a prompt dialog keeps popping up)

    Should you learn the latest Python 3 or the older Python 2.7?

    The official MySQL connector (linked below) only supports up to Python 3.4, so with Python 3.5 we use the third-party library PyMySQL to connect to the MySQL database.

    http://dev.mysql.com/downloads/connector/python/2.0.html


    PyMySQL download page:

    https://pypi.python.org/pypi/PyMySQL#downloads

    Installation on Windows:

    After downloading and unpacking it, enter the PyMySQL-0.6.7 directory and run python setup.py install.
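
    A hedged aside (not in the original post): on machines where pip is available, the same library can usually be installed with pip install PyMySQL instead of running setup.py. A minimal sketch to confirm the install took, assuming the package installed without errors:

    # quick sanity check that PyMySQL is importable
    import pymysql

    # VERSION is a tuple such as (0, 6, 7, None); seeing it printed confirms the install
    print(pymysql.VERSION)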


    test1.py

    import os
    import urllib.request as request

    def baidu_tieba(url, begin_page, end_page):
        # Download each page of the thread and save it as an HTML file
        for i in range(begin_page, end_page + 1):
            sName = 'D:/360Downloads/test/' + str(i).zfill(5) + '.html'
            print('Downloading page ' + str(i) + ', saving it as ' + sName)
            m = request.urlopen(url + str(i)).read()
            with open(sName, 'wb') as file:
                file.write(m)

    if __name__ == "__main__":
        # Make sure the save directory exists before downloading
        os.makedirs('D:/360Downloads/test/', exist_ok=True)
        url = "http://tieba.baidu.com/p/"
        begin_page = 1
        end_page = 3
        baidu_tieba(url, begin_page, end_page)

    test2.py

    import os
    import re
    import urllib.request as request
    import urllib.error as error

    def baidu_tieba(url, begin_page, end_page):
        count = 1
        for i in range(begin_page, end_page + 1):
            sName = 'D:/360Downloads/test/' + str(i).zfill(5) + '.html'
            print('Downloading page ' + str(i) + ', saving it as ' + sName)
            m = request.urlopen(url + str(i)).read()
            # Create a directory for the images found on this page
            dirpath = 'D:/360Downloads/test/'
            dirname = str(i)
            new_path = os.path.join(dirpath, dirname)
            if not os.path.isdir(new_path):
                os.makedirs(new_path)
            page_data = m.decode('gbk', 'ignore')
            page_image = re.compile('<img src="(.+?)"')
            for image in page_image.findall(page_data):
                # Only download images served over http whose URL ends in .png
                pattern = re.compile(r'^http://.*\.png$')
                if pattern.match(image):
                    try:
                        image_data = request.urlopen(image).read()
                        image_path = dirpath + dirname + '/' + str(count) + '.png'
                        count += 1
                        print(image_path)
                        with open(image_path, 'wb') as image_file:
                            image_file.write(image_data)
                    except error.URLError as e:
                        print('Download failed:', e)
            # Save the raw HTML of the page as well
            with open(sName, 'wb') as file:
                file.write(m)

    if __name__ == "__main__":
        url = "http://tieba.baidu.com/p/"
        begin_page = 1
        end_page = 3
        baidu_tieba(url, begin_page, end_page)

    test3.py

    # Python 3.4 crawler tutorial: download the images on a web page
    # Original author: 林炳文Evankaka (blog: http://blog.csdn.net/evankaka/)
    import os
    import re
    import socket            # unused in the original script; see the timeout sketch after this listing
    import urllib.request

    targetDir = r"D:\PythonWorkPlace\load"   # directory the images are saved to

    def destFile(path):
        # Build the local file name for an image URL (everything after the last '/')
        if not os.path.isdir(targetDir):
            os.makedirs(targetDir)
        pos = path.rindex('/')
        t = os.path.join(targetDir, path[pos + 1:])
        print(t)
        return t

    if __name__ == "__main__":   # program entry point
        weburl = "http://www.douban.com/"
        webheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=weburl, headers=webheaders)   # build the request with headers
        webpage = urllib.request.urlopen(req)                          # send the request
        contentBytes = webpage.read()
        # Find every jpg/png/gif link in the page with a regular expression
        for link, t in set(re.findall(r'(https:[^\s]*?(jpg|png|gif))', str(contentBytes))):
            print(link)
            try:
                urllib.request.urlretrieve(link, destFile(link))   # download the image
            except Exception:
                print('Download failed')
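
    The socket import in test3.py is never used; presumably a global download timeout was intended. A minimal sketch of how that could look, assuming a 10-second timeout is acceptable (the image URL below is a placeholder, not from the original post):

    import socket
    import urllib.request

    # Abort any single connection or read that stalls for more than 10 seconds
    socket.setdefaulttimeout(10)

    try:
        # hypothetical image URL, used only for illustration
        urllib.request.urlretrieve("https://example.com/picture.png", "picture.png")
    except Exception as e:
        print('Download failed:', e)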

    test4.py

    '''
    First example: a simple web crawler
    that fetches a single page and prints its HTML.
    '''

    import urllib.request

    # Target URL
    url = "http://bj.58.com/caishui/28707491160259x.shtml?adtype=1&entinfo=28707491160259_0&adact=3&psid=156713756196890928513274724"

    # Build the request
    request = urllib.request.Request(url)

    # Send the request
    response = urllib.request.urlopen(request)

    # Read the result
    data = response.read()

    # Decode the response body
    data = data.decode('utf-8')

    # Print the result
    print(data)

    # Print various information about the fetched page
    # print(type(response))
    # print(response.geturl())
    # print(response.info())
    # print(response.getcode())

    test5.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import csv
    import os
    import urllib.request as request
    from bs4 import BeautifulSoup as bs

    def GetAllLink():
        num = int(input("How many pages to crawl? > "))
        if not os.path.exists('./data/'):
            os.mkdir('./data/')

        for i in range(num):
            if i + 1 == 1:
                url = 'http://nj.58.com/piao/'
                GetPage(url, i)
            else:
                url = 'http://nj.58.com/piao/pn%s/' % (i + 1)
                GetPage(url, i)


    def GetPage(url, num):
        Url = url
        user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
        headers = {'User-Agent': user_agent}
        req = request.Request(Url, headers=headers)
        page = request.urlopen(req).read().decode('utf-8')
        soup = bs(page, "html.parser")
        table = soup.table
        tag = table.find_all('tr')
        # Re-parse just the rows we need
        soup2 = bs(str(tag), "html.parser")
        title = soup2.find_all('a', 't')          # title and URL
        price = soup2.find_all('b', 'pri')        # price
        fixedprice = soup2.find_all('del')        # original price
        date = soup2.find_all('span', 'pr25')     # show time

        atitle = []
        ahref = []
        aprice = []
        afixedprice = []
        adate = []

        for i in title:
            atitle.append(i.get_text())
            ahref.append(i.get('href'))
        for i in price:
            aprice.append(i.get_text())
        for i in fixedprice:
            afixedprice.append(i.get_text())
        for i in date:
            adate.append(i.get_text())

        # newline='' keeps the csv module from writing blank rows on Windows
        csvfile = open('./data/ticket_%s.csv' % num, 'w', newline='', encoding='utf-8')
        writer = csv.writer(csvfile)
        writer.writerow(['title', 'url', 'price', 'original price', 'show time'])
        '''
        Every record has a title, but not necessarily a date (or prices);
        missing fields are padded with '---'.
        '''
        if len(atitle) > len(adate):
            for i in range(len(atitle) - len(adate)):
                adate.append('---')
            for i in range(len(atitle) - len(afixedprice)):
                afixedprice.append('---')
            for i in range(len(atitle) - len(aprice)):
                aprice.append('---')

        for i in range(len(atitle)):
            message = atitle[i] + '|' + ahref[i] + '|' + aprice[i] + '|' + afixedprice[i] + '|' + adate[i]
            writer.writerow(message.split('|'))
        print("[Result]:> page %s saved!" % (num + 1))
        csvfile.close()


    if __name__ == '__main__':
        GetAllLink()

    test6.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import urllib.request as request
    from bs4 import BeautifulSoup as bs

    def GetAllLink():
        num = int(input("How many pages to crawl? > "))

        for i in range(num):
            if i + 1 == 1:
                url = 'http://bj.58.com/caishui/?key=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&cmcskey=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&final=1&jump=1&specialtype=gls'
                GetPage(url, i)
            else:
                url = 'http://bj.58.com/caishui/pn%s/' % (i + 1) + '?key=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&cmcskey=%E4%BB%A3%E7%90%86%E8%AE%B0%E8%B4%A6%E5%85%AC%E5%8F%B8&final=1&specialtype=gls&PGTID=0d30215f-0000-1941-5161-367b7a641048&ClickID=4'
                GetPage(url, i)


    def GetPage(url, num):
        Url = url
        user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
        headers = {'User-Agent': user_agent}
        req = request.Request(Url, headers=headers)
        page = request.urlopen(req).read().decode('utf-8')
        soup = bs(page, "html.parser")
        table = soup.table
        tag = table.find_all('tr')

        # Re-parse just the rows we need
        soup2 = bs(str(tag), "html.parser")

        title = soup2.find_all('a', 't')                  # title and URL
        companyName = soup2.find_all('a', 'sellername')   # company name

        atitle = []
        ahref = []
        acompanyName = []

        for i in title:
            atitle.append(i.get_text())
            ahref.append(i.get('href'))
        for i in companyName:
            acompanyName.append(i.get_text())
        for i in range(len(ahref)):
            getSonPage(str(ahref[i]))


    def getSonPage(url):
        Url = url
        user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
        headers = {'User-Agent': user_agent}
        req = request.Request(Url, headers=headers)
        page = request.urlopen(req).read().decode('utf-8')
        soup = bs(page, "html.parser")
        print("=========================")
        # category
        print(soup.find('div', 'su_con').get_text())
        # service area
        print(soup.find('div', 'su_con quyuline').get_text())
        # contact person
        print(soup.find_all('ul', 'suUl')[0].find_all('li')[2].find_all('a')[0].get_text())
        # company address (strip newlines, tabs and non-breaking spaces)
        print(soup.find_all('ul', 'suUl')[0].find_all('li')[3].find('div', 'su_con').get_text()
              .replace('\n', '').replace('\r', '').replace('\t', '').replace('\xa0', ''))
        # service description (turn the long underscore separators into blank lines
        # and drop the page-specific note "(以下为公司北京区域分布图)")
        print(soup.find('article', 'description_con').get_text()
              .replace("_____________________________________", "\n\n")
              .replace("___________________________________", "\n\n")
              .replace("(以下为公司北京区域分布图)", ""))
        print("=========================")

    if __name__ == '__main__':
        GetAllLink()

    test7.py

    import pymysql

    # Connect to the test database (adjust host/user/password to your environment)
    conn = pymysql.connect(host='192.168.1.102', port=3306, user='root', passwd='123456', db='test', charset='utf8')
    cur = conn.cursor()
    cur.execute("select version()")
    # Iterating over the cursor yields the result rows
    for i in cur:
        print(i)
    cur.close()
    conn.close()
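
    Beyond select version(), the same cursor runs parameterized statements; values are passed separately from the SQL through %s placeholders. The sketch below is not from the original post: the demo table, its columns, and the reuse of the connection settings above are assumptions for illustration only.

    import pymysql

    # Reuses the connection settings from test7.py; adjust to your environment
    conn = pymysql.connect(host='192.168.1.102', port=3306, user='root',
                           passwd='123456', db='test', charset='utf8')
    cur = conn.cursor()
    try:
        # demo_ticket is a hypothetical table created just for this sketch
        cur.execute("CREATE TABLE IF NOT EXISTS demo_ticket (id INT PRIMARY KEY, title VARCHAR(64))")
        # REPLACE INTO keeps the sketch re-runnable; %s placeholders are filled from the tuple
        cur.execute("REPLACE INTO demo_ticket (id, title) VALUES (%s, %s)", (1, 'concert'))
        conn.commit()   # INSERT/UPDATE/DELETE/REPLACE need an explicit commit
        cur.execute("SELECT id, title FROM demo_ticket WHERE id = %s", (1,))
        print(cur.fetchone())   # -> (1, 'concert')
    finally:
        cur.close()
        conn.close()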
  • Original article: https://www.cnblogs.com/yanduanduan/p/7307068.html