155 lines
5.1 KiB
Python
155 lines
5.1 KiB
Python
# Standard library
import json
import time
from datetime import datetime, timedelta

# Third party
import requests
|
||
|
||
"""
|
||
脚本名称: 深圳证券交易所公告抓取器 (SZSE Disclosure Scraper)
|
||
描述: 该脚本用于从深圳证券交易所官网抓取指定股票的公告信息。
|
||
支持分页抓取,并自动过滤最近6个月内的公告。
|
||
抓取的信息包括:发布时间、股票代码、公告标题、PDF文档ID、公告详情链接。
|
||
|
||
依赖:
|
||
- requests
|
||
|
||
安装依赖:
|
||
pip install requests
|
||
|
||
使用方法:
|
||
1. 修改脚本底部的 `stock` 变量为目标股票代码 (例如 "002003")。
|
||
2. 在终端运行: python szse_scraper.py
|
||
|
||
注意:
|
||
- 深交所接口可能对请求频率有限制,脚本中已包含延时 (time.sleep)。
|
||
"""
|
||
|
||
def fetch_szse_disclosures(stock_code, page_num=1, page_size=30):
    """Fetch one page of disclosure announcements from the SZSE API.

    Args:
        stock_code (str): Stock code, e.g. "002003".
        page_num (int): 1-based page number. Defaults to 1.
        page_size (int): Entries per page. Defaults to 30 (the original
            hard-coded value, now parameterized).

    Returns:
        list: Announcement dicts from the API's "data" field, or an
        empty list on request failure or when the page has no data.
    """
    # SZSE announcement-list endpoint (listed-company announcements).
    url = "http://www.szse.cn/api/disc/announcement/annList"

    # Browser-like AJAX headers; the endpoint rejects obvious bot clients.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Content-Type": "application/json",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": f"http://www.szse.cn/disclosure/listed/notice/index.html?stock={stock_code}",
        "X-Request-Type": "ajax",
        "X-Requested-With": "XMLHttpRequest",
    }

    # channelCode "listedNotice_disc" selects listed-company announcements.
    payload = {
        "seDate": ["", ""],  # empty date range = no date restriction
        "stock": [stock_code],
        "channelCode": ["listedNotice_disc"],
        "pageSize": page_size,
        "pageNum": page_num,
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Narrowed from `except Exception`: RequestException covers
        # connection/timeout/HTTP errors; ValueError covers malformed JSON
        # (requests' JSONDecodeError subclasses ValueError). Programming
        # errors are no longer silently swallowed.
        print(f"抓取第 {page_num} 页数据出错: {e}")
        return []

    # The API wraps results in a "data" key; a missing or empty value
    # means "no data / end of pages".
    if "data" in data and data["data"]:
        return data["data"]
    return []
|
||
|
||
def parse_and_print(data):
    """Render a list of announcement records as an aligned text table.

    Args:
        data (list): Announcement dicts as returned by
            fetch_szse_disclosures.
    """
    # Table header.
    print(f"{'时间':<20} {'股票':<10} {'标题':<40} {'文档ID':<20} {'链接'}")
    print("-" * 120)

    for record in data:
        publish_time = record.get("publishTime", "")
        sec_code = record.get("secCode", "")
        sec_name = record.get("secName", "")
        doc_id = record.get("id", "")

        # Strip the search-highlight markup the API sometimes embeds
        # in titles.
        raw_title = record.get("title", "")
        title = raw_title.replace('<span class="highlight">', '').replace('</span>', '')

        # Detail-page URL. A direct PDF download link could instead be
        # built as: http://disc.static.szse.cn/download + attachPath
        link = f"http://www.szse.cn/disclosure/listed/bulletinDetail/index.html?{doc_id}"

        print(f"{publish_time:<20} {sec_name}({sec_code}) {title[:40]:<40} {doc_id:<20} {link}")
|
||
|
||
if __name__ == "__main__":
|
||
# 配置
|
||
stock = "002003" # 伟星股份
|
||
lookback_days = 180 # 回溯最近180天 (半年)
|
||
|
||
print(f"正在抓取 {stock} 最近 {lookback_days} 天的公告...")
|
||
|
||
all_results = []
|
||
page = 1
|
||
# 计算截止日期
|
||
cutoff_date = datetime.now() - timedelta(days=lookback_days)
|
||
stop_fetching = False
|
||
|
||
while not stop_fetching:
|
||
print(f"正在获取第 {page} 页...")
|
||
results = fetch_szse_disclosures(stock, page)
|
||
|
||
if not results:
|
||
print("该页无数据或已到达末尾。")
|
||
break
|
||
|
||
for item in results:
|
||
pub_time_str = item.get("publishTime")
|
||
if pub_time_str:
|
||
# 时间格式通常为 'YYYY-MM-DD HH:MM:SS'
|
||
try:
|
||
pub_time = datetime.strptime(pub_time_str, "%Y-%m-%d %H:%M:%S")
|
||
|
||
# 如果公告发布时间早于截止日期,则停止抓取
|
||
if pub_time < cutoff_date:
|
||
stop_fetching = True
|
||
break # 跳出当前页循环
|
||
|
||
all_results.append(item)
|
||
except ValueError:
|
||
# 如果日期格式解析失败,默认保留该条目
|
||
all_results.append(item)
|
||
|
||
if stop_fetching:
|
||
print("已达到时间限制,停止抓取。")
|
||
break
|
||
|
||
page += 1
|
||
time.sleep(1) # 礼貌延时,避免请求过快
|
||
|
||
if all_results:
|
||
print(f"\n共获取到 {len(all_results)} 条公告:")
|
||
parse_and_print(all_results)
|
||
else:
|
||
print("未找到符合条件的公告数据。")
|