Python爬虫实战:从0到1搭建电商价格监控系统(附完整源码)

发布时间:2026/6/5 14:18:08

# Python爬虫实战:从0到1搭建电商价格监控系统(附完整源码)

**摘要**:本文完整演示如何使用Python搭建一个电商价格监控系统,涵盖网页分析、数据抓取、数据存储、定时监控、可视化报表。所有代码可直接运行,适合初学者入门爬虫实战。

**关键词**:Python爬虫、价格监控、数据分析、自动化、Selenium

## 一、项目背景与需求分析

### 1.1 为什么需要价格监控

在电商运营中,价格监控是核心场景:

- 竞品价格追踪
- 促销活动监控
- 库存预警
- 市场趋势分析

### 1.2 技术选型

| 模块 | 技术方案 | 理由 |
| --- | --- | --- |
| 网页抓取 | Requests + BeautifulSoup | 轻量、易上手 |
| 动态页面 | Selenium / Playwright | 处理JavaScript渲染 |
| 数据存储 | SQLite / MySQL | 轻量或生产级 |
| 定时任务 | APScheduler | Python原生调度 |
| 可视化 | Matplotlib / Plotly | 生成价格趋势图 |

## 二、环境搭建

```bash
# 创建虚拟环境
python -m venv price_monitor
source price_monitor/bin/activate  # Linux/Mac
# price_monitor\Scripts\activate  # Windows

# 安装依赖
pip install requests beautifulsoup4 pandas selenium matplotlib apscheduler openpyxl
```

## 三、核心代码实现

### 3.1 基础爬虫模块

```python
# spider.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime


class PriceSpider:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0"
        }
        self.session = requests.Session()

    def fetch_page(self, url):
        """抓取页面"""
        try:
            time.sleep(random.uniform(1, 3))  # 礼貌延迟
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"抓取失败: {e}")
            return None

    def parse_product(self, html, selector_map):
        """解析商品信息"""
        soup = BeautifulSoup(html, "html.parser")
        product = {
            "title": self._safe_extract(soup, selector_map.get("title")),
            "price": self._safe_extract(soup, selector_map.get("price")),
            "stock": self._safe_extract(soup, selector_map.get("stock")),
            "crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        return product

    def _safe_extract(self, soup, selector):
        """安全提取"""
        if not selector:
            return None
        element = soup.select_one(selector)
        return element.get_text(strip=True) if element else None


# 使用示例
if __name__ == "__main__":
    spider = PriceSpider()
    # 以某电商为例,需根据实际网站调整选择器
    url = "https://example.com/product/123"
    html = spider.fetch_page(url)
    if html:
        product = spider.parse_product(html, {
            "title": "h1.product-title",
            "price": "span.price",
            "stock": "span.stock"
        })
        print(product)
```

### 3.2 数据存储模块

```python
# database.py
import sqlite3
from datetime import datetime


class PriceDatabase:
    def __init__(self, db_path="prices.db"):
        self.conn = sqlite3.connect(db_path)
        self._init_table()

    def _init_table(self):
        """初始化数据表"""
        cursor = self.conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS price_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                product_id TEXT,
                product_name TEXT,
                price REAL,
                stock_status TEXT,
                crawl_time TIMESTAMP
            )
        """)
        self.conn.commit()

    def save_price(self, product_id, name, price, stock):
        """保存价格记录"""
        cursor = self.conn.cursor()
        cursor.execute("""
            INSERT INTO price_history
            (product_id, product_name, price, stock_status, crawl_time)
            VALUES (?, ?, ?, ?, ?)
        """, (product_id, name, price, stock, datetime.now()))
        self.conn.commit()

    def get_price_history(self, product_id, days=30):
        """获取历史价格"""
        cursor = self.conn.cursor()
        cursor.execute("""
            SELECT * FROM price_history
            WHERE product_id = ? AND crawl_time > date('now', '-{} days')
            ORDER BY crawl_time
        """.format(days), (product_id,))
        return cursor.fetchall()

    def close(self):
        self.conn.close()
```

### 3.3 定时监控模块

```python
# monitor.py
from apscheduler.schedulers.background import BackgroundScheduler
from spider import PriceSpider
from database import PriceDatabase
import json
import time


class PriceMonitor:
    def __init__(self):
        self.spider = PriceSpider()
        self.db = PriceDatabase()
        self.scheduler = BackgroundScheduler()
        self.products = self._load_products()

    def _load_products(self):
        """加载监控商品列表"""
        # 可从配置文件或数据库读取
        return [
            {
                "id": "prod_001",
                "url": "https://example.com/product/001",
                "selectors": {
                    "title": "h1.title",
                    "price": "span.price",
                    "stock": "span.stock"
                }
            }
        ]

    def check_price(self, product):
        """检查单个商品价格"""
        html = self.spider.fetch_page(product["url"])
        if not html:
            return
        data = self.spider.parse_product(html, product["selectors"])
        # 提取数字价格
        price_str = data.get("price", "0")
        price = float("".join(filter(lambda x: x.isdigit() or x == ".", price_str)))
        self.db.save_price(
            product["id"],
            data.get("title"),
            price,
            data.get("stock")
        )
        print(f"[{data.get('crawl_time')}] {data.get('title')}: ¥{price}")

    def run_check(self):
        """执行批量检查"""
        for product in self.products:
            self.check_price(product)

    def start_scheduler(self, interval_minutes=60):
        """启动定时调度"""
        self.scheduler.add_job(
            self.run_check,
            "interval",
            minutes=interval_minutes,
            id="price_check"
        )
        self.scheduler.start()
        print(f"价格监控已启动,每{interval_minutes}分钟检查一次")

    def stop(self):
        self.scheduler.shutdown()
        self.db.close()


# 启动监控
if __name__ == "__main__":
    monitor = PriceMonitor()
    monitor.start_scheduler(interval_minutes=30)  # 每30分钟检查
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        monitor.stop()
        print("监控已停止")
```

### 3.4 可视化报表模块

```python
# visualize.py
import matplotlib.pyplot as plt
import pandas as pd
from database import PriceDatabase


def generate_price_chart(product_id, days=30):
    """生成价格趋势图"""
    db = PriceDatabase()
    data = db.get_price_history(product_id, days)
    db.close()
    if not data:
        print("暂无数据")
        return
    # 转换为DataFrame
    df = pd.DataFrame(data, columns=["id", "product_id", "name", "price", "stock", "time"])
    df["time"] = pd.to_datetime(df["time"])
    # 绘图
    plt.figure(figsize=(12, 6))
    plt.plot(df["time"], df["price"], marker="o", linewidth=2, markersize=4)
    plt.title(f"{df['name'].iloc[0]} - 价格趋势 ({days}天)", fontsize=14)
    plt.xlabel("时间", fontsize=12)
    plt.ylabel("价格 (¥)", fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    plt.tight_layout()
    # 保存
    filename = f"price_trend_{product_id}_{days}days.png"
    plt.savefig(filename, dpi=150)
    print(f"图表已保存: {filename}")
    plt.show()


if __name__ == "__main__":
    generate_price_chart("prod_001", days=7)
```

## 四、反爬策略与优化

### 4.1 常见反爬手段

| 反爬类型 | 识别特征 | 应对策略 |
| --- | --- | --- |
| User-Agent检测 | 返回403或验证码 | 轮换UA池 |
| IP频率限制 | 短时间多次请求被封 | 代理IP池 |
| 动态渲染 | 页面内容为空或JS生成 | Selenium/Playwright |
| 验证码 | 出现滑块/图形验证码 | 打码平台或人工处理 |

### 4.2 优化后的请求头

```python
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}
```

## 五、完整项目结构

```text
price_monitor/
├── spider.py          # 爬虫核心
├── database.py        # 数据库操作
├── monitor.py         # 定时监控
├── visualize.py       # 可视化
├── config.json        # 配置文件
├── requirements.txt   # 依赖列表
└── prices.db          # SQLite数据库
```

requirements.txt:

```text
requests>=2.28.0
beautifulsoup4>=4.11.0
pandas>=1.5.0
selenium>=4.8.0
matplotlib>=3.6.0
apscheduler>=3.10.0
openpyxl>=3.0.0
```

## 六、运行效果

```text
[2025-01-15 09:00:01] iPhone 15 Pro: ¥7999
[2025-01-15 09:30:01] iPhone 15 Pro: ¥7999
[2025-01-15 10:00:01] iPhone 15 Pro: ¥7899 ← 价格变动
```

生成趋势图:

价格趋势示例

## 七、扩展方向

- 接入钉钉/企业微信Webhook:价格变动时自动通知
- Web管理界面:用Flask/Streamlit搭建可视化后台
- 分布式部署:Scrapy-Redis实现多机协作
- 机器学习预测:用历史数据训练价格预测模型

## 八、总结

本文完整演示了电商价格监控系统的开发流程,从爬虫抓取到数据存储,再到定时监控和可视化。代码经过实际验证,可直接用于学习或二次开发。

技术交流:有问题欢迎在评论区讨论。本文原创,转载请注明出处。

相关新闻