Python 新浪實時新聞爬蟲
阿新 • • 發佈:2019-01-07
"""Sina real-time news headline crawler.

Original author: 鄭瑞國.

Collects the absolute links found on the Sina news front page, then keeps
polling those pages and appends every previously unseen anchor text
(headline) to a text file, echoing each new headline to stdout.
"""
import http.client
import re
import sys
import time
import urllib.error
import urllib.request

START_URL = 'http://news.sina.com.cn/'
OUTPUT_PATH = r'd:\test.txt'
# Anchor texts of this length or shorter are treated as navigation labels
# rather than headlines and are not saved.
MAX_IGNORED_LENGTH = 8
REQUEST_TIMEOUT_SECONDS = 10
# Pause between two full passes over the link list so the crawler does not
# hammer the site in a tight loop.
POLL_INTERVAL_SECONDS = 60

# Accept both http and https targets; most sites now serve https only.
_LINK_RE = re.compile(r'href="(https?://.*?)"')
# "[^>]*" keeps the match inside a single opening tag, so several anchors on
# one line are each captured; "\b" rejects tags such as <abbr> or <article>.
_ANCHOR_TEXT_RE = re.compile(r'<a\b[^>]*>(.*?)</a>')


def open_url(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download *url* and return its body decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped.

    Args:
        url: Address to fetch (any scheme ``urllib`` supports).
        timeout: Seconds to wait on blocking network operations.

    Returns:
        The response body as a string.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode("utf-8", "ignore")


def find_url(url):
    """Return every absolute http(s) ``href="..."`` target found on *url*."""
    return _LINK_RE.findall(open_url(url))


def find_text(url):
    """Return the inner text of every single-line ``<a>`` element on *url*."""
    return _ANCHOR_TEXT_RE.findall(open_url(url))


def save_text(text, path=OUTPUT_PATH):
    """Append unseen headlines from *text* to the file at *path*.

    An entry is written (and printed) only when it is longer than
    ``MAX_IGNORED_LENGTH`` characters and is neither already in the file nor
    earlier in the same batch.

    Args:
        text: Iterable of candidate headline strings.
        path: File to append to; it is created if missing.

    Raises:
        OSError: If the file cannot be opened for appending or written.
    """
    try:
        with open(path, 'r', encoding='utf-8') as existing:
            seen = {line.rstrip('\n') for line in existing}
    except (OSError, UnicodeError):
        # Missing or unreadable history: start with nothing known.
        seen = set()

    with open(path, 'a', encoding='utf-8') as out:
        for headline in text:
            if len(headline) <= MAX_IGNORED_LENGTH or headline in seen:
                continue
            # Record immediately so repeats within this batch are skipped.
            seen.add(headline)
            try:
                out.write(headline + '\n')
            except UnicodeError:
                # Unencodable text: skip this headline, keep the rest.
                continue
            try:
                print(headline)
            except UnicodeError:
                # The console encoding may not cover every character; the
                # headline is already saved, so only the echo is lost.
                pass


def main():
    """Poll every page linked from the front page for new headlines, forever."""
    url_list = find_url(START_URL)
    while True:
        for c_url in url_list:
            try:
                save_text(find_text(c_url))
            except (OSError, ValueError, http.client.HTTPException) as exc:
                # One dead or malformed link must not stop the crawler.
                print('failed to process {}: {}'.format(c_url, exc),
                      file=sys.stderr)
        time.sleep(POLL_INTERVAL_SECONDS)


if __name__ == "__main__":
    main()