1. 程式人生 > 廣州商學院新聞獲取

廣州商學院新聞獲取

AR start -c sts htm getc href __main__ hit

import re
import xlwt
import time
import pandas
import requests
from multiprocessing import Process,Pool
from bs4 import BeautifulSoup


def getClickCount(newUrl):

    """
    Fetch the click (hit) count of one news article.

    :param newUrl: URL of the article detail page, e.g.
        http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html
    :return: int click count
    """
    # Extract the numeric article id from the URL path
    # ('..._0404/9183.html' -> '0404/9183' -> '9183').
    # Escape the dot so '.html' is matched literally.
    new_id = re.findall(r'_(.*)\.html', newUrl)
    new_id = new_id[0].split('/')[1]
    # The hit counter is served by a separate API endpoint.
    url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(new_id)
    content = requests.get(url)
    # The API returns a JS snippet like $('#hits').html('123');
    clickCount = int(re.search(r"hits'\).html\('(.*)'\);", content.text).group(1))
    return clickCount

def getNewDetail(newsUrl):

    """
    Fetch the details of one GZCC news article.

    :param newsUrl: URL of the article detail page
    :return: dict with body text, metadata fields, click count,
        publish time (as time.struct_time) and the source link
    """
    content = ''
    web = requests.get(newsUrl)
    web.encoding = 'utf-8'
    soup = BeautifulSoup(web.text, 'html.parser')
    structure = soup.find('div', {'class': 'show-content'})  # article body
    for string in structure.stripped_strings:
        content = content + string

    fields = []  # (key, value) pairs; avoid shadowing the builtin `list`
    info = soup.find('div', {'class': 'show-info'})
    # NOTE(review): splitting on the literal character 'n' looks like a
    # mangled '\n'; it only works because the metadata text contains no
    # Latin 'n' — confirm against the live page.
    info = info.text.replace('\xa0', 'n').split('n')  # metadata fields
    for string in info:
        if len(string) > 3:
            if string.find('發布時間') != -1:
                # Normalise the full-width colon after the publish-time label.
                string = string.replace(':', ':', 1)
                string = string.strip()
            if string.find('次') != -1:
                # Replace the static hit text with the live counter value.
                string = '點擊:{}次'.format(getClickCount(newsUrl))

            fields.append(string.split(':'))
    details = dict(fields)
    details['鏈接'] = newsUrl
    details['正文'] = content
    details['發布時間'] = time.strptime(details['發布時間'], '%Y-%m-%d %H:%M:%S')
    return details
def getNewsUrl(url):

    """
    Collect the links of every article on one news listing page.

    :param url: URL of a news listing page
    :return: list of article URLs
    """

    newsList = []
    web = requests.get(url)
    web.encoding = 'utf-8'

    soup = BeautifulSoup(web.text, 'html.parser')
    soup = soup.find('ul', {'class': 'news-list'})
    for child in soup.children:
        # Skip bare whitespace text nodes between the <li> elements.
        if len(child) > 1:
            newsList.append(child.a['href'])
    return newsList

def getPage(url):

    """
    Work out how many listing pages the news section has.

    :param url: URL of the first news listing page
    :return: int number of listing pages
    """
    web = requests.get(url)
    web.encoding = 'utf-8'

    soup = BeautifulSoup(web.text, 'html.parser')
    # The <a class="a1"> element shows the total article count, e.g.
    # "9183條"; drop the trailing unit character.
    soup = soup.find('a', {'class': 'a1'}).string[:-1]

    # Ten articles are shown per listing page.
    page = int(soup) // 10 + 1

    return page

def getnews(url):
    """
    Scrape every article on one listing page into the global ``news`` list.

    :param url: URL of a news listing page
    """
    print('start in %s' % url[39:])
    newsurllist = getNewsUrl(url)
    # Use a distinct loop name so the listing-page URL is not shadowed.
    for newsurl in newsurllist:
        news.append(getNewDetail(newsurl))
    print(' end ', end='')

if __name__ == '__main__':

    # Shared accumulator that getnews() appends article dicts to.
    news = []

    url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    # (Removed an unused getNewsUrl(url) call that wasted a network request.)
    page = getPage(url)
    for i in range(1, page + 1):
        # Page 1 has no numeric suffix; later pages are "<n>.html".
        if i == 1:
            url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
        else:
            url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        getnews(url)
    # Persist every scraped article to an Excel workbook.
    df = pandas.DataFrame(news)
    df.to_excel('gzccnews.xls')

  

廣州商學院新聞獲取