多執行緒快速抓取網頁
阿新 • • 發佈:2019-02-10
以下是一段簡單的程式碼,用於抓取維基百科(Wikipedia)的頁面資料,同時也是一個簡單的多執行緒程式設計範例:佔用的記憶體很少,而且把執行緒數調大之後,抓取效率很高。程式透過 wget 下載頁面,執行時需依序傳入三個參數:查詢詞檔案(每行一個詞條)、輸出檔前綴,以及執行緒數。
import os
import shutil
import subprocess
import sys
import threading

# Number of workers that have finished, successfully or not; guarded by `mutex`.
finish_num = 0
mutex = threading.Lock()

# Directory holding the per-worker scratch file that wget downloads into.
_TMP_DIR = "fetch_wiki"
# Kept as bytes: queries are read and written as raw bytes so the script
# behaves the same on Python 2 and 3 and never trips over page encodings.
_URL_PREFIX = b"zh.wikipedia.org/zh-hans/"


def _ensure_tmp_dir():
    """Create the scratch directory if it does not exist yet."""
    try:
        os.makedirs(_TMP_DIR)
    except OSError:
        # Another worker may have created it first; only a directory that is
        # still missing afterwards is a real error.
        if not os.path.isdir(_TMP_DIR):
            raise


def _append_page(query, tmp_path, fout, sink):
    """Download the page for `query` with wget and append it to `fout`.

    The record written is the URL on its own line followed by the raw bytes
    wget saved. wget's console output is sent to `sink`.
    """
    url = _URL_PREFIX + query
    _ensure_tmp_dir()
    try:
        # Argument list, no shell: the query can no longer be interpreted as
        # shell syntax (quotes, `$`, backticks) the way the old command
        # string could.
        status = subprocess.call(
            ["wget", url, "-O", tmp_path], stdout=sink, stderr=sink
        )
        with open(tmp_path, "rb") as page:
            fout.write(url + b"\n")
            shutil.copyfileobj(page, fout)
        if status != 0:
            # The (possibly empty) record is still written, as before; the
            # failure is just no longer invisible.
            sys.stderr.write(
                "wget exited with status %d for %r\n" % (status, url)
            )
    finally:
        # Always drop the scratch file, even when the copy failed.
        try:
            os.remove(tmp_path)
        except OSError:
            pass


def extract_qid(id, num_of_thread):
    """Fetch the pages for this worker's share of the query file.

    Reads queries (one per line) from ``sys.argv[1]`` and handles the lines
    whose zero-based index modulo `num_of_thread` equals `id`. Results are
    written to ``sys.argv[2] + ".part" + str(id)``.

    A query that fails is reported on stderr and skipped. `finish_num` is
    incremented when the worker ends, whether it succeeded or not, so a
    caller waiting on the counter cannot hang on a failed worker.

    Args:
        id: Zero-based index of this worker.
        num_of_thread: Total number of workers sharing the query file.

    Returns:
        True if the worker ran to completion, False if it aborted on an
        error (for example an unreadable input file); the error is printed.
    """
    global finish_num
    tmp_path = os.path.join(_TMP_DIR, "tmp_search_" + str(id))
    try:
        with open(sys.argv[1], "rb") as fin, \
                open(sys.argv[2] + ".part" + str(id), "wb") as fout, \
                open(os.devnull, "wb") as sink:
            for count, line in enumerate(fin):
                if count % num_of_thread != id:
                    continue
                query = line.strip()
                try:
                    _append_page(query, tmp_path, fout, sink)
                except Exception as e:
                    # Best effort: one bad query must not stop the worker.
                    sys.stderr.write(
                        "worker %d: skipping %r: %s\n" % (id, query, e)
                    )
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        with mutex:
            finish_num += 1


def main():
    """Start ``sys.argv[3]`` workers and wait until all of them finish."""
    if len(sys.argv) < 4:
        sys.stderr.write(
            "usage: %s QUERY_FILE OUTPUT_PREFIX NUM_THREADS\n" % sys.argv[0]
        )
        sys.exit(2)
    num_threads = int(sys.argv[3])
    workers = []
    for i in range(num_threads):
        worker = threading.Thread(target=extract_qid, args=(i, num_threads))
        # Daemon threads die with the main thread, so Ctrl-C still ends the run.
        worker.daemon = True
        worker.start()
        workers.append(worker)
    for worker in workers:
        # Join with a timeout so the main thread stays interruptible.
        while worker.is_alive():
            worker.join(1)


if __name__ == "__main__":
    main()