1.TCP
#1 Server: server.py
import socket
host = '127.0.0.1'  # server IP
port = 9000  # server port
# create a TCP socket object
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((host, port))  # bind the IP and port
s.listen(5)  # start listening, with a backlog of 5 pending connections
print("Listening...")
while True:
    c, addr = s.accept()  # accept a client connection
    print('Client address:', addr)
    data = c.recv(2048)
    print("Message:", data.decode('utf-8'))
    c.send(b'Welcome to connect!')
    c.close()  # close this connection
#2 Client: client.py
import socket
host = '127.0.0.1'
port = 9000
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((host, port))
s.send(b'Hello')
data = s.recv(1024)  # on a connected TCP socket, use recv() rather than recvfrom()
print(data.decode('utf-8'))
s.close()
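TCP is a byte stream, so a single recv() is not guaranteed to return one whole message. A minimal sketch of draining a connection until the peer closes it (recv_all is a hypothetical helper, not part of the socket module):

def recv_all(sock, bufsize=1024):
    # keep reading until recv() returns b'', which means the peer closed the connection
    chunks = []
    while True:
        chunk = sock.recv(bufsize)
        if not chunk:
            break
        chunks.append(chunk)
    return b''.join(chunks)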
2.UDP
#1 server.py
import socket
host = '127.0.0.1'  # server IP
port = 9000  # server port
# create a UDP socket object
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.bind((host, port))
while True:
    data, addr = s.recvfrom(1024)
    print('server received a message from {}:'.format(addr), data)
    s.sendto(data.upper(), addr)
s.close()  # unreachable while the loop runs; shown for completeness
#2 client.py
import socket
host = '127.0.0.1'  # server IP
port = 9000  # server port
c = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
c.sendto(b'hello', (host, port))
data, addr = c.recvfrom(1024)
print('client received a message from {}:'.format(addr), data)
c.close()
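UDP offers no delivery guarantee, so a lost datagram leaves recvfrom() blocked forever. A minimal sketch of a client-side timeout with one retry, assuming the server above is listening on 127.0.0.1:9000:

import socket
c = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
c.settimeout(2.0)  # recvfrom() raises socket.timeout after 2 seconds with no reply
for _ in range(2):  # at most one retry
    try:
        c.sendto(b'hello', ('127.0.0.1', 9000))
        data, addr = c.recvfrom(1024)
        print(data)
        break
    except socket.timeout:
        continue
c.close()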
3.requests
import requests
#1 Response attributes
r = requests.get('https://www.baidu.com')
r.status_code  # HTTP status code
r.text  # decoded HTML source
r.content  # raw response body as bytes
r.json()  # parse the body as JSON (a method, not an attribute)
r.headers  # response headers
r.request.headers  # request headers
r.cookies  # response cookies
r.encoding  # encoding guessed from the response headers
r.apparent_encoding  # encoding detected from the content itself
r.raise_for_status()  # raise HTTPError for 4xx/5xx status codes
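When the server declares no charset, r.encoding may default to the wrong value and garble r.text. A common fix, a minimal sketch using the two encoding attributes above:

r = requests.get('https://www.baidu.com')
r.encoding = r.apparent_encoding  # trust the content-based detection
print(r.text[:200])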
#2 GET requests
headers = {'User-Agent': 'Mozilla/5.0'}
payload = {'key': 'value'}  # params= encodes this into the URL query string
cookies = dict(mycookies='mycookies')
requests.get(url, headers=headers, params=payload, cookies=cookies, timeout=5)
#3 POST requests
payload = {'key': 'value'}
requests.post(url, data=payload)  # form-encoded body
requests.post(url, json=payload)  # JSON body; equivalent to data=json.dumps(payload) plus the JSON header
files = {'file': open('test.txt', 'rb')}
requests.post(url, files=files)  # upload a file
#4 Proxies
proxies = {'https': 'http://x.x.x.x:8000'}
requests.post(url, proxies=proxies)
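When making several requests to the same site, a Session keeps the underlying TCP connection alive and carries cookies across requests. A minimal sketch:

import requests
s = requests.Session()
s.headers.update({'User-Agent': 'Mozilla/5.0'})  # default headers for every request
r = s.get('https://www.baidu.com')  # cookies set here are resent automatically
r = s.get('https://www.baidu.com')
s.close()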
4.selenium
# pip install selenium
# download ChromeDriver and put it in the same directory as the script (or on PATH)
#1 Basic usage
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.get("https://www.baidu.com")
print(browser.current_url)
print(browser.get_cookies())
print(browser.page_source)  # HTML source of the current page
# find an element by id (find_element_by_id was removed in Selenium 4)
element = browser.find_element(By.ID, "kw")
print(element.id)  # internal element id
print(element.text)  # visible text
print(element.location)  # position on the page
print(element.tag_name)  # tag name
print(element.size)  # width and height
print(element.get_attribute('class'))  # attribute value
# type into the element
element.send_keys("Python")
element.send_keys(Keys.ENTER)  # simulate the Enter key
element.clear()  # clear the input
# find an element by XPath
element = browser.find_element(By.XPATH, '//*[@id="su"]')
element.click()  # simulate a click
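When the session is finished, quit() closes every window and stops the driver process; save_screenshot() is handy for checking what the browser actually rendered. A minimal sketch reusing the browser object above ('result.png' is an arbitrary filename):

browser.save_screenshot('result.png')  # capture the current page as a PNG
browser.quit()  # close all windows and end the chromedriver process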
#2 Explicit waits
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# PhantomJS support was removed from Selenium; headless Chrome replaces it here
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)
browser.set_window_size(1400, 900)
# object that polls the page for up to 10 seconds
wait = WebDriverWait(browser, 10)
def search(url):
    try:
        browser.get(url)
        # wait until the target element is present in the DOM
        element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#kw")))
    except TimeoutException:
        search(url)  # retry on timeout (unbounded; consider capping retries)
search('https://www.baidu.com')
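presence_of_element_located only checks that the element is in the DOM. To interact with it, element_to_be_clickable additionally waits until it is visible and enabled. A minimal sketch reusing the wait object above:

button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#su")))
button.click()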
5.BeautifulSoup
from bs4 import BeautifulSoup
import re
html_doc = '''
<!DOCTYPE html>
<html><head><title>百度一下,你就知道 </title></head>
<body>
<div id="u1">
<a class="mnav" href="http://news.baidu.com" id="news">新闻</a>
<a class="mnav" href="https://www.hao123.com">hao123</a>
<a class="mnav" href="http://map.baidu.com" id="map">地图</a>
<a class="mnav" href="http://v.baidu.com" id="video">视频</a>
<a class="mnav" href="http://tieba.baidu.com" id="tieba">贴吧</a>
</div>
</body>
</html>
'''
#1 Create a BeautifulSoup object
soup = BeautifulSoup(html_doc, 'html.parser')
# pretty-print the parsed HTML
print(soup.prettify())
#2 Access the first matching <a> tag
print(soup.a)
# <a class="mnav" href="http://news.baidu.com" id="news">新闻</a>
# value of the tag's class attribute
print(soup.a['class'])
# ['mnav']
# dict of all the tag's attributes
print(soup.a.attrs)
# {'class': ['mnav'], 'href': 'http://news.baidu.com', 'id': 'news'}
# text content of the tag
print(soup.a.string)
# 新闻
#3 CSS selectors (return a list)
print(soup.select('#map'))
# [<a class="mnav" href="http://map.baidu.com" id="map">地图</a>]
print(soup.select('div a[id="map"]'))
# [<a class="mnav" href="http://map.baidu.com" id="map">地图</a>]
#4 Searching with find_all()
print(soup.find_all('a', id='video'))
# [<a class="mnav" href="http://v.baidu.com" id="video">视频</a>]
print(soup.find_all(re.compile("^a")))
# [<a class="mnav" href="http://news.baidu.com" id="news">新闻</a>, <a class="mnav" href="https://www.hao123.com">hao123</a>, <a class="mnav" href="http://map.baidu.com" id="map">地图</a>, <a class="mnav" href="http://v.baidu.com" id="video">视频</a>, <a class="mnav" href="http://tieba.baidu.com" id="tieba">贴吧</a>]
# find_all() can also search the document's strings (string= supersedes the older text=)
print(soup.find_all(string=re.compile("^贴")))
# ['贴吧']
# find_all() accepts a custom filter function
def func(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
print(soup.find_all(func))
# [<a class="mnav" href="https://www.hao123.com">hao123</a>]
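A typical scraping step is collecting one attribute from every match. A minimal sketch, assuming the soup object above:

links = [a['href'] for a in soup.find_all('a')]
print(links)
# ['http://news.baidu.com', 'https://www.hao123.com', 'http://map.baidu.com', 'http://v.baidu.com', 'http://tieba.baidu.com']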
6.PyQuery
from pyquery import PyQuery as pq
html_doc = '''
<!DOCTYPE html>
<html><head><title>百度一下,你就知道 </title></head>
<body>
<div id="u1">
<a class="mnav" href="http://news.baidu.com" id="news">新闻</a>
<a class="mnav" href="https://www.hao123.com">hao123</a>
<a class="mnav" href="http://map.baidu.com" id="map">地图</a>
<a class="mnav" href="http://v.baidu.com" id="video">视频</a>
<a class="mnav" href="http://tieba.baidu.com" id="tieba">贴吧</a>
</div>
</body>
</html>
'''
#1 Initialization: pq() accepts an HTML string directly
doc = pq(html_doc)
#2 CSS selectors
item = doc('#news')
print(item.attr('href'))
# http://news.baidu.com
print(item.text())
# 新闻
#3 Chained traversal
items = doc('#u1').find('a').items()
for item in items:
    print(item.text())
# 新闻 hao123 地图 视频 贴吧
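The same items() iterator works for extracting attributes from every match. A minimal sketch, assuming the doc object above:

for a in doc('#u1 a').items():
    print(a.attr('href'))
# http://news.baidu.com
# https://www.hao123.com
# http://map.baidu.com
# http://v.baidu.com
# http://tieba.baidu.com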