Parameters passed via params are URL-encoded into the query string, e.g. ?q=python&page=1.
Common request headers include User-Agent, Content-Type, and Authorization.
import requests
url = "https://httpbin.org/get"
params = {"q": "python", "page": 1}
headers = {"User-Agent": "CS106A-Client/1.0"}
r = requests.get(url, params=params, headers=headers, timeout=5)
r.raise_for_status()
print(r.status_code)
print(r.url)
print(r.json())
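The example above sends only a User-Agent. Here is a brief sketch of an Authorization header with a placeholder token (Content-Type is normally set for you by the data= or json= arguments):
import requests
headers = {"Authorization": "Bearer <your-token>"}  # placeholder token, not a real credential
r = requests.get("https://httpbin.org/get", headers=headers, timeout=5)
r.raise_for_status()
print(r.json()["headers"]["Authorization"])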
import requests
# Form submission (form-encoded body)
r1 = requests.post("https://httpbin.org/post", data={"name": "Alice"})
print(r1.json()["form"]) # {'name': 'Alice'}
# JSON submission (JSON body)
r2 = requests.post("https://httpbin.org/post", json={"name": "Bob"})
print(r2.json()["json"]) # {'name': 'Bob'}
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
session = requests.Session()
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retry))
try:
    resp = session.get("https://httpbin.org/status/503", timeout=5)
    resp.raise_for_status()
except requests.exceptions.RequestException as e:
    print("request error:", e)
Always call raise_for_status(), set a reasonable timeout, and pair them with a retry strategy; avoid retrying without a limit.
import requests
from bs4 import BeautifulSoup
resp = requests.get("https://example.com", headers={"User-Agent": "CS106A"}, timeout=5)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
# Extract the page title and all links
title = soup.find("title").get_text(strip=True)
links = [a.get("href") for a in soup.find_all("a") if a.get("href")]
print(title)
print(links[:10])
Respect robots.txt and each site's terms of use, and do not crawl at a high rate. Send an identifying User-Agent, sleep between requests, and retry failed requests; a minimal polite-crawling sketch is shown below.
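A minimal polite-crawling sketch, assuming illustrative paths and a one-second delay (neither is from the original); urllib.robotparser checks robots.txt before each request.
import time
import urllib.robotparser
import requests

# Load and parse the site's robots.txt once
rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

headers = {"User-Agent": "CS106A-Client/1.0"}
for path in ["/", "/about"]:  # illustrative paths
    url = "https://example.com" + path
    if not rp.can_fetch(headers["User-Agent"], url):
        continue  # skip anything robots.txt disallows
    resp = requests.get(url, headers=headers, timeout=5)
    resp.raise_for_status()
    time.sleep(1)  # rate-limit between requests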
import socket
with socket.create_connection(("example.com", 80), timeout=5) as s:
    req = (
        "GET / HTTP/1.1\r\n"
        "Host: example.com\r\n"
        "Connection: close\r\n\r\n"
    )
    s.sendall(req.encode("ascii"))
    data = b""
    while True:
        chunk = s.recv(4096)
        if not chunk:
            break
        data += chunk
print(data[:200])
import socket
def run_echo_server(host="127.0.0.1", port=5000):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as srv:
        srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        srv.bind((host, port))
        srv.listen()
        print(f"listening on {host}:{port}")
        while True:
            conn, addr = srv.accept()
            with conn:
                print("connected:", addr)
                while True:
                    buf = conn.recv(1024)
                    if not buf:
                        break
                    conn.sendall(buf)

# run_echo_server()
import socketserver
class EchoHandler(socketserver.BaseRequestHandler):
    def handle(self):
        while True:
            data = self.request.recv(1024)
            if not data:
                break
            self.request.sendall(data)
# with socketserver.TCPServer(("127.0.0.1", 5001), EchoHandler) as server:
# server.serve_forever()
import asyncio
import aiohttp
async def fetch(session, url):
    async with session.get(url, timeout=5) as resp:
        resp.raise_for_status()
        return await resp.text()

async def main():
    urls = ["https://example.com" for _ in range(5)]
    async with aiohttp.ClientSession() as session:
        texts = await asyncio.gather(*[fetch(session, u) for u in urls])
        print(len(texts))

# asyncio.run(main())
import requests
url = "https://speed.hetzner.de/100MB.bin"
with requests.get(url, stream=True, timeout=10) as r:
    r.raise_for_status()
    with open("file.bin", "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
import requests
API = "https://httpbin.org/get"
params = {"city": "Beijing"}
resp = requests.get(API, params=params, timeout=5)
resp.raise_for_status()
print(resp.json())
# Idea: the server keeps a set of connections and broadcasts every message it receives to all of them
# It can be built with select, asyncio, or threading; only the outline is given here (a sketch follows)
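A minimal sketch of that outline using asyncio streams; the port number and the line-based framing are assumptions, not part of the original.
import asyncio

clients = set()  # StreamWriter objects for every connected client

async def handle(reader, writer):
    clients.add(writer)
    try:
        while True:
            data = await reader.readline()
            if not data:
                break
            # broadcast the received line to every other client
            for w in list(clients):
                if w is not writer:
                    w.write(data)
                    await w.drain()
    finally:
        clients.discard(writer)
        writer.close()
        await writer.wait_closed()

async def main():
    server = await asyncio.start_server(handle, "127.0.0.1", 5002)
    async with server:
        await server.serve_forever()

# asyncio.run(main())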
Garbled text: set response.encoding explicitly, or fall back to response.apparent_encoding.
Certificate errors: verify=False can skip verification temporarily (not recommended), or specify the certificate via verify/cert.
Proxies: use the HTTP_PROXY/HTTPS_PROXY environment variables or the proxies parameter.
A short combined sketch of these fixes follows.
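A hedged sketch of all three fixes; the CA-bundle path and the proxy address are placeholders, not real values.
import requests

# Encoding: trust the detected encoding when the declared one produces garbage
r = requests.get("https://example.com", timeout=5)
r.encoding = r.apparent_encoding
print(r.text[:100])

# Certificates: point verify at a CA bundle (placeholder path); verify=False disables checking entirely
r = requests.get("https://example.com", verify="/path/to/ca-bundle.pem", timeout=5)

# Proxies: pass an explicit proxies mapping (placeholder address)
proxies = {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}
r = requests.get("https://example.com", proxies=proxies, timeout=5)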
import requests
try:
    r = requests.get("https://expired.badssl.com/", timeout=5)
    r.raise_for_status()
except requests.exceptions.SSLError as e:
    print("SSL error:", e)
Exercises:
Use requests to fetch a web page and save it to a file; print the first 200 characters.
Add a retry strategy with exponential backoff (backoff_factor).
Crawl a few pages of a given site, parse the titles and links, respect robots.txt, set a User-Agent and a rate limit, and export the results.
Build an API client that supports authentication, retries, timeouts, pagination, and error handling, and produces a summary report.
Implement a reliable echo/chat-room server with concurrent connections, logging, and recovery from errors.