本文共 1771 字,大约阅读时间需要 5 分钟。
目标:获取上交所和深交所所有股票的名称和交易信息,保存到文件中
技术路线:使用 requests-bs4-re。网站选择:没有 robots 协议限制;股票信息静态存在于 HTML 页面中,非 JS 代码生成。结构设计:通过查看源码的方式定位到需要的字段。
# Crawl stock codes from the Eastmoney listing page, then fetch each stock's
# detail page from Baidu Gupiao and append the parsed trading info to a file.
# NOTE(review): the original post marked this code "(untested)"; it depended on
# two now-defunct page layouts — confirm the URLs/selectors before relying on it.
import re
import traceback

import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch *url* and return its decoded HTML text, or "" on any failure.

    Uses the response's apparent encoding so Chinese pages are not garbled.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # avoid mojibake on GBK pages
        return r.text
    except Exception:
        # Best-effort fetch: callers treat "" as "page unavailable, skip it".
        return ""


def getStockList(lst, stockURL):
    """Append every stock code (sh/sz + 6 digits) found on the listing page to *lst*.

    Codes are extracted from the href attribute of every <a> tag; anchors with
    no href or no matching code are skipped.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        try:
            href = anchor.attrs['href']
            # Bug fix vs. original: re.find_all -> re.findall, stray ')' removed.
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except (KeyError, IndexError):
            # No href attribute, or no code in it — not an error, just skip.
            continue


def getStockInfo(lst, stockURL, fpath):
    """Fetch each stock's detail page, parse its name and dt/dd field pairs,
    and append one dict-per-line record to the file at *fpath*.

    Pages that fail to download are skipped; parse errors are logged via
    traceback and the loop continues with the next stock.
    """
    # Bug fix vs. original: fpath was accepted but never written — results
    # were silently discarded. Open once and append a record per stock.
    with open(fpath, 'a', encoding='utf-8') as f:
        for stock in lst:
            url = stockURL + stock + ".html"
            html = getHTMLText(url)
            try:
                if html == "":
                    continue
                infoDict = {}
                soup = BeautifulSoup(html, 'html.parser')
                # Bug fixes vs. original: sou -> soup, attra -> attrs,
                # stocInfo -> stockInfo.
                stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
                name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
                infoDict['股票名称'] = name.text.split()[0]
                keyList = stockInfo.find_all('dt')
                valueList = stockInfo.find_all('dd')
                for key, val in zip(keyList, valueList):
                    infoDict[key.text] = val.text
                f.write(str(infoDict) + '\n')
            except Exception:
                # Bug fix vs. original: the try block had no except clause
                # (a syntax error). Log and move on to the next stock.
                traceback.print_exc()
                continue


def main():
    """Entry point: build the code list, then scrape and persist each stock."""
    # Bug fixes vs. original: stoc_list_url/ouput_file were defined but the
    # calls used stock_list_url/stoc_info_url — a guaranteed NameError.
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D://test.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == "__main__":
    # Guarded so importing this module does not trigger network I/O.
    main()
转载地址:http://wyhpn.baihongyu.com/