手把手教你用Python爬网页数据:含翻页、动态页面、CSV导出全流程

发布时间:2026/6/5 17:29:19

手把手教你用Python爬网页数据:含翻页、动态页面、CSV导出全流程

在数据分析、自动化采集、竞品研究等场景中,Python 爬虫非常常用。这篇文章将带你从 0 到 1 学会网页数据爬取,并提供可直接复制运行的代码示例。

一、爬虫基础概念

Python 爬网页通常分两步:

1. 请求网页:使用 requests 获取 HTML 内容
2. 解析数据:使用 BeautifulSoup / lxml 提取需要的信息

对于 JavaScript 动态渲染页面,还可以使用 Selenium 模拟浏览器加载后再提取数据。

二、安装依赖

先安装常用库:

    pip install requests beautifulsoup4 lxml pandas selenium webdriver-manager

三、最基础示例:获取网页标题

这是最简单的静态网页抓取例子:

    import requests
    from bs4 import BeautifulSoup

    url = "https://example.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
    }

    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()  # 请求失败会抛出异常

    soup = BeautifulSoup(resp.text, "lxml")
    title = soup.title.get_text(strip=True)
    print("网页标题:", title)

四、实战案例:爬取豆瓣电影 Top250(含翻页 + 保存CSV)

经典学习案例,适合入门练习。注意:请遵守网站 robots 协议与访问频率限制。

    import requests
    from bs4 import BeautifulSoup
    import csv
    import time

    BASE_URL = "https://movie.douban.com/top250?start={}"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
    }

    def fetch_page(start):
        url = BASE_URL.format(start)
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
        return resp.text

    def parse_html(html):
        soup = BeautifulSoup(html, "lxml")
        items = soup.select("div.item")
        data = []
        for item in items:
            rank = item.select_one("em").get_text(strip=True)
            title = item.select_one("span.title").get_text(strip=True)
            rating = item.select_one("span.rating_num").get_text(strip=True)
            quote_tag = item.select_one("span.inq")
            quote = quote_tag.get_text(strip=True) if quote_tag else ""
            data.append([rank, title, rating, quote])
        return data

    def save_csv(rows, filename="douban_top250.csv"):
        with open(filename, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.writer(f)
            writer.writerow(["排名", "电影名", "评分", "短评"])
            writer.writerows(rows)

    def main():
        all_rows = []
        # Top250 每页25条,共10页
        for start in range(0, 250, 25):
            try:
                html = fetch_page(start)
                rows = parse_html(html)
                all_rows.extend(rows)
                print(f"已抓取 start={start}, 本页{len(rows)}条")
                time.sleep(1.5)  # 礼貌爬取,降低频率
            except Exception as e:
                print(f"抓取 start={start} 失败:{e}")
        save_csv(all_rows)
        print(f"完成,共保存 {len(all_rows)} 条数据到 douban_top250.csv")

    if __name__ == "__main__":
        main()

五、增强稳定性:加入重试机制(推荐)

网络波动、接口超时很常见,建议对请求加自动重试。

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def create_session():
        session = requests.Session()
        retry = Retry(
            total=5,  # 最多重试5次
            backoff_factor=1,  # 重试间隔递增: 1s, 2s, 4s...
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"]
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        return session

    session = create_session()
    resp = session.get("https://example.com", timeout=10)
    print(resp.status_code)

六、动态网页抓取(Selenium 示例)

如果 requests.get(url).text 看不到目标数据,说明数据可能是 JS 动态加载。这时可以用 Selenium 模拟浏览器。

1)基础 Selenium 示例(无头浏览器)

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    import time

    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get("https://example.com")
        time.sleep(2)
        h1 = driver.find_element(By.TAG_NAME, "h1").text
        print("h1内容:", h1)
    finally:
        driver.quit()

2)推荐方式:显式等待

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver = webdriver.Chrome()
    driver.get("https://example.com")

    try:
        elem = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        print(elem.text)
    finally:
        driver.quit()

七、数据保存(CSV / Excel)

    import pandas as pd

    data = [
        {"name": "A", "price": 10},
        {"name": "B", "price": 20},
    ]

    df = pd.DataFrame(data)
    df.to_csv("result.csv", index=False, encoding="utf-8-sig")
    df.to_excel("result.xlsx", index=False)
    print("已保存")

八、通用爬虫模板(建议收藏)

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import time

    def get_html(url, headers, timeout=10):
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        return resp.text

    def parse(html):
        soup = BeautifulSoup(html, "lxml")
        results = []
        # TODO: 按你的网页结构修改选择器
        for item in soup.select(".item"):
            title = item.get_text(strip=True)
            results.append({"title": title})
        return results

    def main():
        headers = {
            "User-Agent": "Mozilla/5.0 ..."
        }
        urls = [
            "https://example.com/page1",
            "https://example.com/page2",
        ]
        all_data = []
        for url in urls:
            try:
                html = get_html(url, headers)
                data = parse(html)
                all_data.extend(data)
                print(f"{url} - {len(data)}条")
                time.sleep(1)
            except Exception as e:
                print(f"失败: {url}, 错误: {e}")
        pd.DataFrame(all_data).to_csv("output.csv", index=False, encoding="utf-8-sig")
        print("完成")

    if __name__ == "__main__":
        main()

九、常见问题与排查

1)403 Forbidden
- 增加 User-Agent、Referer
- 降低请求频率
- 使用 Session 保持 Cookie

2)抓不到目标数据
- 多半是 JS 动态加载
- 打开浏览器开发者工具(Network → XHR/Fetch)找真实接口

3)中文乱码
- 可设置 resp.encoding = resp.apparent_encoding
- CSV 推荐 encoding="utf-8-sig"

4)IP 被限制
- 降速、随机间隔
- 避免高并发轰炸
- 合法、合规使用代理(在授权范围内)

十、法律与合规提醒(务必重视)

- 遵守网站 robots.txt 和服务条款
- 不抓取隐私、敏感、受保护数据
- 不进行攻击性高频请求
- 数据仅用于合法合规用途(学习、研究、授权业务)

结语

爬虫核心不复杂:请求 + 解析 + 存储。建议学习路径:

1. 先掌握静态页面(requests + BeautifulSoup)
2. 再学翻页、重试、异常处理
3. 最后处理动态网页(Selenium)或接口逆向

按照上述学习路径,你就可以轻松抓取网页数据啦!

相关新闻