-
Notifications
You must be signed in to change notification settings - Fork 27
/
web_spiders_tools.py
180 lines (146 loc) · 5.97 KB
/
web_spiders_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# coding=utf-8
import urllib2
from bs4 import BeautifulSoup
import pandas
import numpy as np
import time
# 打开广发择时站点的某个策略的信号页面,将页面中的日期和信号信息下载,解析后存入本地csv文件
def download_GFModelSignal(url):
content = urllib2.urlopen(url).read()
soup = BeautifulSoup(content, "lxml")
soup.prettify()
signalDic = {}
date = ""
signal = ""
for tag in soup.find_all("td"):
if tag.string == None:
for child in tag.children:
if str(child).find("↑") != -1:
signal="up"
elif str(child).find("↓") != -1:
signal="down"
elif str(child).find("↓") != -1:
signal="down"
else:
date = tag.string
if date != "" and signal != "":
return
# 获取所有基金的历史净值信息
def download_fund_data(dir_path):
fund_infos_df = get_all_fund('D:\\OpenSourceTradePlatform\\fund_info.txt')
int
for index, row in fund_infos_df.iterrows(): # 获取每行的index、row
# index = '000076'
if index < '240009':
continue
print('start to download==> ' + index)
start_date, end_date = parse_fund_data(index, dir_path)
fund_infos_df.loc[index, ['start_date']] = start_date
fund_infos_df.loc[index, ['end_date']] = end_date
print('download successful==> ' + index)
file_path = dir_path + 'fund_infos.csv'
fund_infos_df.to_csv(file_path, encoding='gbk', index=False)
# break # 此处break是为了测试的时候在获取完第一个基金的数据后就停止,去掉后即可访问所有的数据了
print 'success to download all fund infos'
print fund_infos_df
# 爬虫程序,根据输入的基金编码,去东方财富基金部分上下载所有的历史数据
# 输入参数fund_cdoe:场外基金代码
# 输入参数dir_path:输出文件路径
def parse_fund_data(fund_code, dir_path):
page_index = 1
page_count = 1
histroy_data = []
while True:
url = "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code=%s&page=%s&per=20&sdate=&edate="%(fund_code, str(page_index))
count = 0
while True:
try:
response = urllib2.urlopen(url, timeout=10.0)
if (response <> None):
break
except:
print('链接错误,重新链接')
time.sleep(5)
if (count >=5):
return "获取超时", "获取超时"
count += 1
content = response.read()
soup = BeautifulSoup(content, "lxml")
soup.prettify()
for tag in soup.find_all("tbody"):
for child in tag.children:
day_data = []
for child2 in child.children:
day_data.append(child2.string)
# print(child2.string)
histroy_data.append(day_data)
if page_index == 1:
for child in soup.body.children:
pos = str(child).find("pages")
if pos != -1:
pos2 = str(child).find(",curpage")
page_count = int(str(child)[pos + 6:pos2])
break
if page_index == page_count:
break
if page_count == 0:
return '', ''
page_index += 1
histroy_data_array = np.array(histroy_data)
# print(histroy_data_array)
if (len(histroy_data[0]) == 7):
fund_data_df = pandas.DataFrame(histroy_data_array, columns=['date', 'net_value', 'total_value', ' growth_rate', 'apply_status', 'redeem_status', 'poundage'])
else:
fund_data_df = pandas.DataFrame(histroy_data_array, columns=['date', 'profit', '7day_yield', 'apply_status', 'redeem_status', 'poundage'])
# print(fund_data_df)
fund_data_df = fund_data_df.sort(columns=['date'], axis=0, ascending=True)
start_date = fund_data_df.iat[0, 0]
end_date = fund_data_df.iat[-1, 0]
file_path = dir_path + fund_code + '.csv'
fund_data_df.to_csv(file_path, encoding='gbk', index=False)
return start_date, end_date
# 通过格式化后的基金列表文件,获取所有的基金列表和编号,并获取基金的手续费信息,用于之后获取所有的基金的历史净值信息
def get_all_fund(file_name):
file_object = open(file_name, 'rb')
try:
all_the_text = file_object.read().decode("gbk")
finally:
file_object.close()
fund_infos = []
fund_codes= []
fund_array = all_the_text.split('\r\n')
for arr in fund_array:
infos = arr.split('@')
# print(arr)
fund_data = []
for i in range(len(infos)):
if i == 0:
# print(infos[i])
fund_data.append(infos[i] + '.FUND')
fund_codes.append(infos[i])
else:
info = infos[i]
fundName = info[:info.index("&")]
# print(fundName)
fund_data.append(fundName)
if not info.endswith('---'):
# print('---')
# fund_data.append(None)
# else:
charge = info[len(info) - 5:]
# print(charge)
fund_data.append(charge)
fund_infos.append(fund_data)
fund_infos_df = pandas.DataFrame(fund_infos, columns=['fund_code', 'fund_name', 'charge'], index=fund_codes)
fund_infos_df = fund_infos_df.sort(columns=['fund_code'], axis=0, ascending=True)
fund_infos_df['start_date'] = None
fund_infos_df['end_date'] = None
print '--------get all fund info-------'
print fund_infos_df
return fund_infos_df
def main():
# download_GFModelSignal("http://218.19.190.27/LLThistory.asp")
download_fund_data('D:\\OpenSourceTradePlatform\\fund_data\\')
# get_all_fund('D:\\OpenSourceTradePlatform\\fund_info.txt')
if __name__=="__main__":
main()