FA3-Datafetch/legacy/szse_scraper.py
2026-01-03 18:27:19 +08:00

155 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
import time
from datetime import datetime, timedelta
"""
脚本名称: 深圳证券交易所公告抓取器 (SZSE Disclosure Scraper)
描述: 该脚本用于从深圳证券交易所官网抓取指定股票的公告信息。
支持分页抓取并自动过滤最近6个月内的公告。
抓取的信息包括发布时间、股票代码、公告标题、PDF文档ID、公告详情链接。
依赖:
- requests
安装依赖:
pip install requests
使用方法:
1. 修改脚本底部的 `stock` 变量为目标股票代码 (例如 "002003")。
2. 在终端运行: python szse_scraper.py
注意:
- 深交所接口可能对请求频率有限制,脚本中已包含延时 (time.sleep)。
"""
def fetch_szse_disclosures(stock_code, page_num=1):
    """Fetch one page of announcements for a stock from the SZSE API.

    Args:
        stock_code (str): Stock code, e.g. "002003".
        page_num (int): 1-based page number, defaults to 1.

    Returns:
        list: Announcement dicts from the API response's "data" field;
        an empty list when the request fails, the response body is not
        valid JSON, or there is no data for the requested page.
    """
    # SZSE announcement-list endpoint.
    url = "http://www.szse.cn/api/disc/announcement/annList"
    # Browser-like headers to avoid trivial anti-scraping blocks.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Content-Type": "application/json",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": f"http://www.szse.cn/disclosure/listed/notice/index.html?stock={stock_code}",
        "X-Request-Type": "ajax",
        "X-Requested-With": "XMLHttpRequest"
    }
    # Request body:
    #   channelCode "listedNotice_disc" selects listed-company announcements;
    #   seDate of two empty strings means no server-side date filtering.
    payload = {
        "seDate": ["", ""],
        "stock": [stock_code],
        "channelCode": ["listedNotice_disc"],
        "pageSize": 30,
        "pageNum": page_num
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Narrowed from a blanket `except Exception`: only network/HTTP
        # failures and JSON-decoding errors are expected here; anything
        # else (e.g. a programming bug) should propagate.
        print(f"抓取第 {page_num} 页数据出错: {e}")
        return []
    # The API nests the page's records under "data"; treat a missing or
    # falsy field as "no results" for this page.
    if data.get("data"):
        return data["data"]
    return []
def parse_and_print(data):
    """Print a formatted table of announcement records.

    Args:
        data (list): Announcement dicts as returned by
            fetch_szse_disclosures.
    """
    # Table header followed by a horizontal rule.
    print(f"{'时间':<20} {'股票':<10} {'标题':<40} {'文档ID':<20} {'链接'}")
    print("-" * 120)
    for record in data:
        publish_time = record.get("publishTime", "")
        sec_code = record.get("secCode", "")
        sec_name = record.get("secName", "")
        doc_id = record.get("id", "")
        # Strip the API's search-highlight markup from the title.
        raw_title = record.get("title", "")
        title = raw_title.replace('<span class="highlight">', '').replace('</span>', '')
        # Detail-page URL. A direct PDF download link could instead be
        # built as http://disc.static.szse.cn/download + attachPath.
        link = f"http://www.szse.cn/disclosure/listed/bulletinDetail/index.html?{doc_id}"
        print(f"{publish_time:<20} {sec_name}({sec_code}) {title[:40]:<40} {doc_id:<20} {link}")
if __name__ == "__main__":
    # Target security and how far back (in days) to collect announcements.
    stock = "002003"  # Weixing Co., Ltd.
    lookback_days = 180  # roughly the last six months

    print(f"正在抓取 {stock} 最近 {lookback_days} 天的公告...")

    collected = []
    page = 1
    # Announcements published before this instant are out of scope.
    earliest_allowed = datetime.now() - timedelta(days=lookback_days)
    reached_cutoff = False

    while True:
        print(f"正在获取第 {page} 页...")
        page_items = fetch_szse_disclosures(stock, page)
        if not page_items:
            print("该页无数据或已到达末尾。")
            break
        for entry in page_items:
            stamp = entry.get("publishTime")
            if not stamp:
                # No timestamp at all: skip the record, same as before.
                continue
            # Timestamps normally look like 'YYYY-MM-DD HH:MM:SS'.
            try:
                published = datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
            except ValueError:
                # Unparseable date: keep the record by default.
                collected.append(entry)
                continue
            if published < earliest_allowed:
                # Results come newest-first, so everything after this
                # point is older than the cutoff — stop entirely.
                reached_cutoff = True
                break
            collected.append(entry)
        if reached_cutoff:
            print("已达到时间限制,停止抓取。")
            break
        page += 1
        time.sleep(1)  # polite delay between page requests

    if collected:
        print(f"\n共获取到 {len(collected)} 条公告:")
        parse_and_print(collected)
    else:
        print("未找到符合条件的公告数据。")