# 用pyquery抓取分析北京房地產成交資料
# 阿新 • 發佈:2018-12-23
#!/usr/bin/env python
#coding:gbk
#author:周海漢
#note:分析北京住房和城鄉建設委員會每天房產成交資料
import urllib2
import sys
import os
import datetime
import time
import shutil
from pyquery import PyQuery as pq
# Raw accumulation file: the script appends each newly fetched batch of text here.
fn = "1.txt"
# Working copy: overwritten with the contents of `fn` after each successful fetch.
fnwork = "roominfo.csv"
# Stores the timestamp of the last successful fetch; hasfetch() reads it back.
lockfile = "lock.txt"
# Placeholder for the downloaded page body; reassigned once the page has been read.
html = ""
def hasfetch(lock_path=None):
    """Report whether a fetch has already been recorded for the current day.

    The lock file is expected to start with a timestamp formatted as
    ``YYYY-MM-DD HH:MM:SS`` (the first 19 characters of
    ``str(datetime.datetime.now())``).  That timestamp is truncated to
    midnight of its day and compared against the current time.

    Args:
        lock_path: Path of the lock file to inspect.  Defaults to the
            module-level ``lockfile`` when omitted.

    Returns:
        True when less than one whole day has elapsed since midnight of the
        recorded fetch day; False when a day or more has elapsed, or when the
        lock file cannot be read or does not start with a valid timestamp.
    """
    if lock_path is None:
        lock_path = lockfile

    tnow = datetime.datetime.now()
    print("NOW : %s" % (tnow,))

    try:
        # Context manager so the handle is released even if read() fails.
        with open(lock_path, 'r') as lock:
            t = lock.read()
        print("LAST FETCH TIME: %s" % (t,))
        # Only the first 19 characters are parsed; fractional seconds are ignored.
        date = time.strptime(t[:19], "%Y-%m-%d %H:%M:%S")
    except (IOError, OSError, ValueError) as e:
        # Missing/unreadable lock file or malformed timestamp: treat as "not fetched".
        print(e)
        return False

    # Keep only year/month/day so the comparison is against midnight of that day.
    old = datetime.datetime(date[0], date[1], date[2])
    print(old)

    return (tnow - old).days <= 0
# Skip the whole run when hasfetch() reports that a fetch was already recorded.
if hasfetch():
    print('has crawled the room info, DO NOTHING!!!!')
    sys.exit(0)

# NOTE(review): these headers are defined but never sent, because the page is
# requested with a plain urlopen(url) rather than a Request carrying them.
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
# An earlier revision targeted tabid/2167; only the address below is used.
url = "http://www.bjjs.gov.cn/tabid/2207/Default.aspx"

try:
    url = url.encode('utf8')
    url = urllib2.unquote(url)
    f = urllib2.urlopen(url)
    try:
        html = f.read()
    finally:
        # Always release the HTTP response, even if read() raises.
        f.close()

    # getfilesystemencoding() may return None on some platforms; fall back
    # to UTF-8 so the encode() calls below always get a valid codec name.
    fs_encoding = sys.getfilesystemencoding() or "utf-8"

    p = pq(html.decode("UTF-8"))
    data = p("#ess_ctr5233_ModuleContent")
    table1 = data.find("table")

    # Tables at index 1..3 inside the module container are exported; spaces
    # in the extracted text are turned into commas to form CSV fields.
    # NOTE(review): the original source lost its indentation and the escape
    # in the line terminator; one newline per table is assumed here - confirm.
    rows = []
    for i in range(1, 4):
        rows.append(table1.eq(i).text().replace(' ', ',').encode(fs_encoding))
    txt = "".join(row + "\n" for row in rows)

    with open(fn, "a") as w:
        w.write(txt)
    shutil.copyfile(fn, fnwork)
    print('sucessfull!')

    # Record the time of this successful fetch for the next hasfetch() check.
    with open(lockfile, "w") as l:
        l.write(str(datetime.datetime.now()))
except Exception as e:
    # Top-level boundary of the script: report the failure and fall through.
    print('except')
    print(e)