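# data_fetcher.py
# Fetches daily Hong Kong stock data for symbols 00981 and 01347 via akshare
# and preprocesses it into date-indexed open/high/low/close/volume tables.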
|
|
import akshare as ak
|
|
import pandas as pd
|
|
import datetime
|
|
|
|
def fetch_stock_data():
    """Download daily history for the two Hong Kong tickers via akshare.

    Calls ``ak.stock_hk_daily`` once for symbol "00981" and once for symbol
    "01347", prints the column names of each result, and hands both results
    back untouched.

    Returns:
        A ``(stock_00981, stock_01347)`` tuple, in that order.
    """
    print("正在获取股票数据...")

    smic_raw = ak.stock_hk_daily(symbol="00981")
    hhic_raw = ak.stock_hk_daily(symbol="01347")

    # Show the raw column layout of each download before any preprocessing.
    for label, frame in (
        ("中芯国际数据列名:", smic_raw),
        ("华虹半导体数据列名:", hhic_raw),
    ):
        print(label, frame.columns.tolist())

    return smic_raw, hhic_raw
|
|
|
|
def preprocess_data(df, symbol):
    """Normalize a raw daily-price table into a date-indexed OHLCV frame.

    The input is copied and never modified. The date index is taken from a
    ``date`` column if present, otherwise from a ``日期`` column, otherwise
    from the existing index (parsed with ``pd.to_datetime``). Chinese
    price/volume headers (开盘, 最高, 最低, 收盘, 成交量) are renamed to
    their English equivalents no matter where the date came from.

    Args:
        df: Raw price table with either English or Chinese column headers.
        symbol: Ticker code; used to label the error raised when required
            columns are missing.

    Returns:
        A new DataFrame sorted by ascending date index that contains exactly
        the columns ``open``, ``high``, ``low``, ``close``, ``volume``.

    Raises:
        KeyError: If any of the five required columns is still absent after
            renaming. The message names the symbol and the missing columns.
    """
    required_columns = ['open', 'high', 'low', 'close', 'volume']
    chinese_to_english = {
        '开盘': 'open',
        '最高': 'high',
        '最低': 'low',
        '收盘': 'close',
        '成交量': 'volume',
    }

    df = df.copy()

    # Prefer an explicit date column (English first, then Chinese); fall back
    # to parsing whatever index the frame already has.
    date_column = next(
        (name for name in ('date', '日期') if name in df.columns), None
    )
    if date_column is None:
        df.index = pd.to_datetime(df.index)
    else:
        df['date'] = pd.to_datetime(df[date_column])
        df = df.set_index('date')

    # Rename only when the English name is not already present, so a frame
    # carrying both spellings never ends up with duplicate column labels.
    pending_renames = {
        chinese: english
        for chinese, english in chinese_to_english.items()
        if chinese in df.columns and english not in df.columns
    }
    df = df.rename(columns=pending_renames)

    missing = [name for name in required_columns if name not in df.columns]
    if missing:
        raise KeyError(
            f"{symbol}: missing required column(s) {missing}; "
            f"available columns: {df.columns.tolist()}"
        )

    return df.sort_index()[required_columns]
|
|
|
|
def get_processed_data():
    """Fetch both stocks, preprocess them, and trim them to a shared window.

    Steps, each reported on stdout:
      1. Download the raw tables with ``fetch_stock_data``.
      2. Normalize each with ``preprocess_data``.
      3. Keep only the 360 days ending at the latest date of the 00981 data.
      4. Keep only the dates present in both tables.

    Returns:
        A ``(smic_data, hhic_data)`` tuple of DataFrames sharing one index.
    """
    raw_smic, raw_hhic = fetch_stock_data()

    smic = preprocess_data(raw_smic, "00981")
    hhic = preprocess_data(raw_hhic, "01347")

    print(f"中芯国际原始数据时间范围: {smic.index.min()} 到 {smic.index.max()}")
    print(f"华虹半导体原始数据时间范围: {hhic.index.min()} 到 {hhic.index.max()}")

    # The window is anchored on the 00981 series only and spans 360 days
    # (slightly under a calendar year).
    window_end = smic.index.max()
    window_start = window_end - pd.Timedelta(days=360)

    print(f"\n限制回测时间范围: {window_start} 到 {window_end}")

    smic, hhic = (
        smic.loc[window_start:window_end],
        hhic.loc[window_start:window_end],
    )

    print(f"限制后中芯国际数据形状: {smic.shape}")
    print(f"限制后华虹半导体数据形状: {hhic.shape}")

    # Drop any date that is missing from either series so rows line up 1:1.
    shared_dates = smic.index.intersection(hhic.index)
    smic = smic.loc[shared_dates]
    hhic = hhic.loc[shared_dates]

    print(f"\n对齐后数据时间范围: {smic.index.min()} 到 {smic.index.max()}")
    print(f"对齐后数据点数: {len(smic)}")

    return smic, hhic
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the fetch-and-preprocess pipeline once and
    # report completion. The results stay bound at module level.
    smic_data, hhic_data = get_processed_data()
    print("数据获取和预处理完成!")