# URL templates for the three Baidu Yun listing endpoints.  Placeholders:
#   {uk}    - user key of the account being queried
#   {start} - paging offset
#   {id}    - id of the originating urlids row, echoed back via ``urlid`` so
#             the response handler can match the response to its queue row
URL_SHARE = 'http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&start={start}&limit=20&query_uk={uk}&urlid={id}'
URL_FOLLOW = 'http://yun.baidu.com/pcloud/friend/getfollowlist?query_uk={uk}&limit=20&start={start}&urlid={id}'
# Example: http://yun.baidu.com/pcloud/friend/getfanslist?query_uk=1327787586&limit=25&start=0
URL_FANS = 'http://yun.baidu.com/pcloud/friend/getfanslist?query_uk={uk}&limit=20&start={start}&urlid={id}'


爬蟲用到三張資料表:一個是urlids,儲存要爬取的網址;一個是user,存放使用者uk;另一個是share,存放user分享的資料,包含任何你想要的資料。


def _open_db():
    """Open a MySQL connection to the ``baiduyun`` database.

    Returns:
        A ``(connection, cursor)`` tuple; the caller closes both.
    """
    dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8')
    dbcurr = dbconn.cursor()
    dbcurr.execute('SET NAMES utf8')
    dbcurr.execute('set global wait_timeout=60000')
    return dbconn, dbcurr


def _queue_remaining_pages(dbcurr, payload, uid, start, page_size, url_type, err_label):
    """Queue every page after the first one when the first page of a listing arrives.

    Nothing is queued unless ``start`` is 0 and the payload reports a positive
    ``total_count``.  Insert failures are reported with ``err_label`` and
    skipped so one bad row does not abort the whole response.
    """
    total = payload.get("total_count", 0)
    if not (total > 0 and start == 0):
        return
    # Floor division keeps the Python 2 integer-division page count explicit.
    for page in range((total - 1) // page_size):
        try:
            dbcurr.execute(
                'INSERT INTO urlids(uk, start, limited, type, status) '
                'VALUES(%s, %s, %s, %s, 0)',
                (uid, page_size * (page + 1), page_size, url_type))
        except Exception as ex:
            print("%s %s" % (err_label, str(ex)))


def _store_users(dbcurr, users, uk_key, uname_key, tnow, err_label):
    """Insert every user of a follow/fans page that is not yet in the ``user`` table."""
    for item in users:
        if item['pubshare_count'] == 0:
            print("---------------------count ==0-------------------------------------------\n")
        dbcurr.execute('SELECT id FROM user WHERE userid=%s', (item[uk_key],))
        known = dbcurr.fetchone()
        print("user uk %s" % item[uk_key])
        if known:
            print("-----------------userid exists---------------------------------\n")
            continue
        try:
            # Parameterised: user names and avatar URLs come from the remote
            # service and may contain quotes.
            dbcurr.execute(
                'INSERT INTO user(userid, username, files, status, downloaded, lastaccess,'
                'avatar_url,fans_count,follow_count,album_count) '
                'VALUES(%s, %s, %s, 0, 0, %s, %s, %s, %s, %s)',
                (item[uk_key], item[uname_key], item['pubshare_count'], tnow,
                 item['avatar_url'], item['fans_count'], item['follow_count'],
                 item['album_count']))
        except Exception as ex:
            print("%s %s" % (err_label, str(ex)))


def _handle_user_list(dbcurr, metadata, uid, start, tnow, list_key, uk_key,
                      uname_key, url_type, banner, page_err, user_err):
    """Process one follow-list (type 1) or fans-list (type 2) response.

    Queues the remaining pages, stores unknown users, and - when the response
    carries no user list - drops the queued pages beyond ``start`` for this
    user and type.
    """
    payload = json.loads(metadata)
    print(banner)
    _queue_remaining_pages(dbcurr, payload, uid, start, ONEPAGE, url_type, page_err)
    if list_key in payload:
        _store_users(dbcurr, payload[list_key], uk_key, uname_key, tnow, user_err)
    else:
        print("delete %s %s %s" % (url_type, uid, start))
        dbcurr.execute(
            'delete from urlids where uk=%s and type=%s and start>%s',
            (uid, url_type, start))


def _handle_shares(dbcurr, metadata, uid, start, tnow):
    """Process one share-list (type 0) response and store its records in ``share``."""
    global news, totals
    shares = json.loads(metadata)
    print("shares")
    totals += 1
    _queue_remaining_pages(dbcurr, shares, uid, start, ONESHAREPAGE, 0, "E3")
    if "records" not in shares:
        print("delete 0 %s %s" % (uid, start))
        dbcurr.execute(
            'delete from urlids where uk=%s and type=0 and start>%s', (uid, start))
        return
    for item in shares["records"]:
        print("-------------------------------------filename------------------  %s" % item['title'])
        print("---------------------------------------------------------------\n")
        try:
            # feed_time is divided by 1000 before being handed to
            # time.localtime (presumably milliseconds - confirm against the API).
            stamp_t = int(item["feed_time"]) // 1000
            share_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stamp_t))
            urls = item['shorturl'] if "shorturl" in item else ""
            news += 1
            length = str(item['filelist'][0]['size']) if "filelist" in item else ""
            # Parameterised: titles and URLs are remote, untrusted text.
            dbcurr.execute(
                'INSERT INTO share(fid,userid, filename, shareid, status,filetype,'
                'share_time,create_time,urls,down,length) '
                'VALUES(%s,%s, %s, %s, 0,%s,%s,%s,%s,0,%s)',
                (sid(int(item['shareid'])), uid, item['title'], item['shareid'],
                 get_category(get_ext(item['title'])), share_time, tnow, urls, length))
        except Exception as ex:
            print("\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>E33\n %s" % str(ex))
            print("item ---------------------------------------------\n")


def response_worker():
    """Consume fetched responses from ``hc_r`` and persist them to MySQL.

    Each queue item is ``(body_text, url)``.  The URL decides how the body is
    interpreted: ``getfollowlist`` (type 1), ``getfanslist`` (type 2), anything
    else is treated as a share list (type 0).  After a response is handled its
    row is removed from ``urlids`` and the transaction is committed.  Runs
    until the process exits.
    """
    global news, totals
    dbconn, dbcurr = _open_db()
    try:
        while True:
            print("function response_worker %s" % hc_r.qsize())
            metadata, effective_url = hc_r.get()
            print("response_worker: %s" % effective_url)
            # Bound before the try so the E5 handler can always report it.
            url_id = None
            try:
                tnow = datetime.datetime.utcnow()
                local_now = tnow + datetime.timedelta(hours=8)
                date = datetime.datetime(local_now.year, local_now.month, local_now.day)
                if news >= 100:
                    # Flush the counters into the daily status report.
                    try:
                        dbcurr.execute(
                            'INSERT INTO spider_statusreport(date,new_hashes,total_requests) '
                            'VALUES(%s,%s,%s) ON DUPLICATE KEY UPDATE '
                            'total_requests=total_requests+%s,new_hashes=new_hashes+%s',
                            (date, news, totals, totals, news))
                    except Exception as ex:
                        print("E10 %s" % str(ex))
                    news = 0
                # int() keeps these numeric in the parameterised statements, as
                # they were when interpolated unquoted into the SQL text.
                url_id = int(re_urlid.findall(effective_url)[0])
                start = int(re_start.findall(effective_url)[0])
                uid = int(re_uid.findall(effective_url)[0])
                if 'getfollowlist' in effective_url:
                    _handle_user_list(
                        dbcurr, metadata, uid, start, tnow,
                        "follow_list", "follow_uk", "follow_uname", 1,
                        "-------------------------------------follows-------------------------------\n",
                        "E1", "E13")
                elif 'getfanslist' in effective_url:
                    _handle_user_list(
                        dbcurr, metadata, uid, start, tnow,
                        "fans_list", "fans_uk", "fans_uname", 2,
                        "----------------------------------------fans----------------------------------\n",
                        "E2", "E23")
                else:
                    _handle_shares(dbcurr, metadata, uid, start, tnow)
                dbcurr.execute('delete from urlids where id=%s', (url_id,))
                dbconn.commit()
            except Exception as ex:
                print("E5 %s %s" % (str(ex), url_id))
            pid = re_pptt.findall(effective_url)
            if pid:
                print("pid>>> %s" % (pid,))
                ppid = int(pid[0])
                PROXY_LIST[ppid][6] -= 1
    finally:
        dbcurr.close()
        dbconn.close()


def worker(k):
    """Turn pending ``urlids`` rows into request URLs on ``hc_q``.

    Args:
        k: worker index; used as the row offset so that concurrent workers
            pick different pending rows (and different 100-user slices).

    When no URL is pending, up to 100 unprocessed users are expanded into
    their three first-page ``urlids`` rows (shares, follows, fans).  Runs
    until the process exits.
    """
    offset = int(k)
    dbconn, dbcurr = _open_db()
    try:
        while True:
            dbcurr.execute('select * from urlids where status=0 limit %s,1', (offset,))
            rows = dbcurr.fetchall()
            if rows:
                row_id, uk, start, _limited, url_type = rows[0][:5]
                dbcurr.execute('update urlids set status=1 where id=%s', (row_id,))
                url = ""
                if url_type == 0:
                    url = URL_SHARE.format(uk=uk, start=start, id=row_id).encode('utf-8')
                elif url_type == 1:
                    url = URL_FOLLOW.format(uk=uk, start=start, id=row_id).encode('utf-8')
                elif url_type == 2:
                    url = URL_FANS.format(uk=uk, start=start, id=row_id).encode('utf-8')
                if url:
                    hc_q.put((url_type, url))
            else:
                print("\ndata user uk\n ")
                dbcurr.execute(
                    'select * from user where status=0 limit %s,100', (offset * 100,))
                print("user ")
                users = dbcurr.fetchall()
                if users:
                    for item in users:
                        try:
                            print("update user %s" % item[1])
                            # One first-page row per listing type:
                            # 0 = shares, 1 = follows, 2 = fans.
                            for url_type, page_size in ((0, ONESHAREPAGE), (1, ONEPAGE), (2, ONEPAGE)):
                                dbcurr.execute(
                                    'insert into urlids(uk, start, limited, type, status) '
                                    'values(%s, 0, %s, %s, 0)',
                                    (item[1], page_size, url_type))
                            dbcurr.execute(
                                'update user set status=1 where userid=%s and id=%s',
                                (item[1], item[6]))
                        except Exception as ex:
                            print("E6 %s" % str(ex))
                else:
                    # Nothing to do: back off instead of hammering the database.
                    time.sleep(1)
            dbconn.commit()
    finally:
        dbcurr.close()
        dbconn.close()


def req_worker(inx):
    """Fetch URLs from ``hc_q`` and push ``(body_text, url)`` onto ``hc_r``.

    Args:
        inx: worker index (not used by the body).

    Sleeps one second before each request.  A failed request is reported and
    skipped; its ``urlids`` row is not touched here.
    """
    session = requests.Session()
    while True:
        time.sleep(1)
        _req_type, url = hc_q.get()
        try:
            resp = session.get(url)
            hc_r.put((resp.text, url))
        except Exception as ex:
            # Previously swallowed silently; report so failures are visible.
            print("E7 %s %s" % (str(ex), url))


# Start the pipeline: 3 fetchers, 2 URL producers, 2 response consumers.
for item in range(3):
    t = threading.Thread(target=req_worker, args=(item,))
    t.daemon = True
    t.start()

for item in range(2):
    s = threading.Thread(target=worker, args=(item,))
    s.daemon = True
    s.start()

for item in range(2):
    t = threading.Thread(target=response_worker, args=())
    t.daemon = True
    t.start()

# Keep the main thread alive for the daemon threads.  Sleeping avoids the
# 100% CPU spin of an empty ``while`` loop.
while True:
    time.sleep(1)



 最近百度雲盤不知道為啥不提供資源檢索,正好最近看了一下python,正好來練練手,寫個爬蟲爬一下百度雲盤的資源。 分析了一下百度雲盤的網頁原始碼和js檔案,裡面有大量ajax的東西,利用json傳輸資料,前端顯示。話說,這樣資料爬取就方便多了,也不要用scrapy啥的


點選它,再點選右邊的【Cookies】就可以看到請求頭裡的 cookie 情況。cookie分析除了上面說到的兩個 cookie ,其他的請求頭引數可以參照手動轉存時抓包的請求頭。這兩個 cookie 預留出來做引數的原因是 cookie 都是有生存週期的,過期了需要更新,不同的賬號登入也有不同的 cooki


> 覺得有幫助的別忘了關注一下知識圖譜與大資料公眾號 ## 開始 在上一文中,我們儲存了百度雲盤的地址和提取碼,但是這種分享連結很容易被遮蔽,最好的做法就是儲存資源到自己的網盤,不過採集的連結有上萬個,人肉儲存並不現實,所以本文嘗試了批量儲存資源,如您還沒看過上文,這裡可以跳轉。 [爬蟲學習3:搭建自


不知道大家有沒有這樣的煩惱,百度雲盤的內容很多,有時候找不到,甚至壓根兒不知道裡面有什麼,可能裡面藏了很多的寶藏自己都忘了。 下面告訴大家一個辦法可以快速地匯出自己網盤的內容目錄,方便自己瀏覽。 總共分三步:   第一步:找到本地百度資料檔案;   第二步:匯出自己需要的資料;   第三步:處理資料; 是不是


首先,我這裡有一份相關城市以及該城市的公園數量的txt檔案: 其次,利用百度地圖API提供的介面爬取城市公園的相關資訊。 所利用的API介面有兩個: 1、http://api.map.baidu.com/place/v2/search?q=公園&


web上傳檔案的功能一般有兩種方式: 1. 普通上傳:將本地檔案的路徑作為一個值放在input標籤中,通過form表單將這個值提交給伺服器 2. 外掛上傳:一般指基於Flash,Javascript,或者Ajax等技術實現的上傳功能 下面舉例說明自動化百度


