PHP解析HTML DOM節點樹
阿新 • • 發佈:2019-01-08
不得不感嘆用DOM直接解析HTML DOM樹的靈活和強大,因為基本的HTML元素就是那麼幾種常見的,再加上ID屬性或者CLASS屬性之類的。
在解析HTML檔案時,完全可以從正則中脫離出來,畢竟HTML檔案中存在大量相似的模式,而且程式碼看上去功能比較顯而易見。當然,正則是非常強大的,應用的領域也更廣。
程式碼如下:
<?php
// Report only fatal and parse errors. The original intent was to hide the
// warnings raised while loading HTML that embeds JavaScript.
error_reporting(E_ERROR | E_PARSE);

/**
 * Extracts dictionary-style sections from an HTML document using the DOM API.
 *
 * The constructor loads the document, then walks every <div> and dispatches on
 * its exact (trimmed) class attribute to the extractor for that section. Each
 * extractor appends the text it finds to one of the private arrays below; the
 * public get*Meaning() methods expose those arrays.
 *
 * NOTE(review): the class names ("basic clearfix", "layout dual", ...) are
 * presumably those of one specific dictionary page layout - confirm against
 * the page this is meant to parse.
 */
class DomTree
{
    // DOM handle
    private $doc = null;
    // Basic definitions
    private $basic_meaning = array();
    // English-Chinese bilingual definitions
    private $en_or_ch = array();
    // English-English definitions
    private $en_to_en = array();
    // Example sentences
    private $example = array();
    // Common sentence patterns
    private $sentences = array();
    // Glossary / collocations
    private $glossary = array();
    // Famous quotations
    private $auth = array();
    // Common usage mistakes
    private $use_in_wrong = array();
    // Synonyms
    private $approximate_words = array();
    // Encyclopedia explanation
    private $baike_trans = array();

    // Maps a div's exact class attribute value to the extractor method for it.
    private static $section_handlers = array(
        "basic clearfix" => "getBasicMeans",
        "layout dual"    => "getEnOrCh",
        "layout en"      => "getEnToEn",
        "layout sort"    => "getExample",
        "layout patt"    => "normalSentence",
        "layout coll"    => "getGlossary",
        "layout auth"    => "getAuth",
        "layout comn"    => "useInWrong",
        "layout nfw"     => "getApproximateWords",
        "layout baike"   => "getBaike",
    );

    /**
     * Loads $source and extracts every recognised section from it.
     *
     * @param string $source An http(s)/ftp URL, a path to an existing file, or
     *                       a string of HTML markup (checked in that order).
     *
     * Terminates the script via die() when $source is not a string, is blank,
     * or is a URL whose content cannot be fetched.
     */
    public function __construct($source)
    {
        $this->doc = new DOMDocument();

        // Collect libxml parse warnings internally instead of emitting them, so
        // loading does not depend on the global error_reporting level.
        $previous = libxml_use_internal_errors(true);
        $this->loadSource($source);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        foreach ($this->doc->getElementsByTagName("div") as $div) {
            if (!$div->hasAttribute("class")) {
                continue;
            }
            $class = trim($div->getAttribute("class"));
            if (isset(self::$section_handlers[$class])) {
                $handler = self::$section_handlers[$class];
                $this->$handler($div);
            }
        }
    }

    /**
     * Feeds $source into the DOM document.
     *
     * The URL test runs before the file and markup tests: every URL is also a
     * string, so testing is_string() first (as the code originally did) made
     * the URL branch unreachable and parsed the URL text itself as HTML.
     */
    private function loadSource($source)
    {
        if (!is_string($source)) {
            die("不支援的資源型別");
        }
        // Compare against '' rather than using empty(), which would also
        // reject the string "0".
        if (trim($source) === '') {
            die("傳入的字串不能為空");
        }
        if (preg_match('#^(https?|ftp)://#i', $source)) {
            $html = file_get_contents($source);
            // file_get_contents() returns false on failure; never hand that
            // (or an empty body) to loadHTML().
            if ($html === false || trim($html) === '') {
                die("無法讀取資源");
            }
            $this->doc->loadHTML($html);
            return;
        }
        if (is_file($source)) {
            $this->doc->loadHTMLFile($source);
            return;
        }
        // Anything else is treated as literal markup.
        $this->doc->loadHTML($source);
    }

    /**
     * Collects the text of <li> elements below $container.
     *
     * @param DOMElement  $container Section element to search.
     * @param string|null $list_tag  When given ("ol"/"ul"), only <li> elements
     *                               inside descendants with that tag name are
     *                               collected, list by list; when null, every
     *                               <li> descendant of $container is collected.
     * @param bool        $normalize Pass each text through strip_Empty().
     * @return array List of strings in document order.
     */
    private function collectListItems($container, $list_tag = null, $normalize = true)
    {
        $lists = ($list_tag === null)
            ? array($container)
            : $container->getElementsByTagName($list_tag);

        $texts = array();
        foreach ($lists as $list) {
            foreach ($list->getElementsByTagName("li") as $li) {
                $texts[] = $normalize ? $this->strip_Empty($li->nodeValue) : $li->nodeValue;
            }
        }
        return $texts;
    }

    // Basic definitions: the <strong> texts of every <li> that carries no
    // style attribute.
    private function getBasicMeans($basic_div)
    {
        foreach ($basic_div->getElementsByTagName("li") as $li) {
            if ($li->hasAttribute("style")) {
                continue;
            }
            foreach ($li->getElementsByTagName("strong") as $strong) {
                $this->basic_meaning[] = $strong->nodeValue;
            }
        }
    }

    // English-Chinese bilingual definitions: raw text of every <li>.
    private function getEnOrCh($div_elem)
    {
        $this->en_or_ch = array_merge($this->en_or_ch, $this->collectListItems($div_elem, null, false));
    }

    // English-English definitions: whitespace-normalised text of every <li>.
    private function getEnToEn($div_elem)
    {
        $this->en_to_en = array_merge($this->en_to_en, $this->collectListItems($div_elem));
    }

    /**
     * Formatting helper: replaces each run of two or more whitespace
     * characters with a single space.
     *
     * @return string|null The normalised string, or null for non-string input.
     */
    private function strip_Empty($string)
    {
        if (!is_string($string)) {
            return null;
        }
        return preg_replace('#\s{2,}#', ' ', $string);
    }

    // Example sentences: <li> items of every <ol>.
    private function getExample($div_elem)
    {
        $this->example = array_merge($this->example, $this->collectListItems($div_elem, "ol"));
    }

    // Common sentence patterns: <li> items of every <ol>.
    private function normalSentence($div_elem)
    {
        $this->sentences = array_merge($this->sentences, $this->collectListItems($div_elem, "ol"));
    }

    // Glossary: <li> items of every <ul>.
    private function getGlossary($div_elem)
    {
        $this->glossary = array_merge($this->glossary, $this->collectListItems($div_elem, "ul"));
    }

    // Famous quotations: <li> items of every <ul>.
    private function getAuth($div_elem)
    {
        $this->auth = array_merge($this->auth, $this->collectListItems($div_elem, "ul"));
    }

    // Common usage mistakes: <li> items of every <ol>.
    private function useInWrong($div_elem)
    {
        $this->use_in_wrong = array_merge($this->use_in_wrong, $this->collectListItems($div_elem, "ol"));
    }

    // Synonyms: raw text of every <a> inside the <li> items of every <ul>.
    private function getApproximateWords($div_elem)
    {
        foreach ($div_elem->getElementsByTagName("ul") as $ul) {
            foreach ($ul->getElementsByTagName("li") as $li) {
                foreach ($li->getElementsByTagName("a") as $anchor) {
                    $this->approximate_words[] = $anchor->nodeValue;
                }
            }
        }
    }

    // Encyclopedia explanation: raw text of the <li> items of every <ul>.
    private function getBaike($div_elem)
    {
        $this->baike_trans = array_merge($this->baike_trans, $this->collectListItems($div_elem, "ul", false));
    }

    // Public accessors. Each returns the collected array, or null when nothing
    // was collected for that section.

    // Basic definitions
    public function getBasicMeaning()
    {
        return $this->basic_meaning ?: null;
    }

    // English-Chinese bilingual definitions
    public function getEnOrChMeaning()
    {
        return $this->en_or_ch ?: null;
    }

    // English-English definitions
    public function getEnToEnMeaning()
    {
        return $this->en_to_en ?: null;
    }

    // Example sentences
    public function getExampleMeaning()
    {
        return $this->example ?: null;
    }

    // Common sentence patterns
    public function getNormalSentenceMeaning()
    {
        return $this->sentences ?: null;
    }

    // Glossary
    public function getGlossaryMeaning()
    {
        return $this->glossary ?: null;
    }

    // Famous quotations
    public function getAuthMeaning()
    {
        return $this->auth ?: null;
    }

    // Common usage mistakes
    public function getUseInWrongMeaning()
    {
        return $this->use_in_wrong ?: null;
    }

    // Synonyms
    public function getApproximateWordsMeaning()
    {
        return $this->approximate_words ?: null;
    }

    // Encyclopedia explanation
    public function getBaikeMeaning()
    {
        return $this->baike_trans ?: null;
    }

    /**
     * Returns every section in one associative array.
     *
     * @return array Keyed by section name; a value is null when that section
     *               yielded nothing.
     */
    public function getAllMeaning()
    {
        return array(
            'basic_meaning'     => $this->getBasicMeaning(),
            'en_or_ch'          => $this->getEnOrChMeaning(),
            'en_to_en'          => $this->getEnToEnMeaning(),
            'example'           => $this->getExampleMeaning(),
            'normal_sentence'   => $this->getNormalSentenceMeaning(),
            'glossary_sentence' => $this->getGlossaryMeaning(),
            'auth_sentence'     => $this->getAuthMeaning(),
            'wrong_use'         => $this->getUseInWrongMeaning(),
            'approximate_words' => $this->getApproximateWordsMeaning(),
            'baike_meaning'     => $this->getBaikeMeaning(),
        );
    }
}

// Demo driver: parse a local page and dump every extracted section.
$dom = new DomTree("./com.html");
$trans = $dom->getAllMeaning();
echo "<pre>";
print_r($trans);
?>
結果如下:
Array ( [basic_meaning] => Array ( [0] => 單詞;訊息;話語;諾言 [1] => 用詞語表達 ) [en_or_ch] => Array ( [0] => [C] 字,詞 the smallest unit of spoken language which has meaning and can stand alone [1] => [C] (說的)話,話語,言語 anything said; remark or statement [2] => [S] 訊息,資訊; 謠言 piece of news; message; rumour [3] => [S] 口令,號令; 命令 spoken command or signal [4] => [S] 諾言,保證 a promise [5] => vt. 用詞語表達; 選用 express (sth) in particular words; phrase sth ) [en_to_en] => Array ( [0] => a unit of language that native speakers can identify; "words are the blocks from which sentences are made" "he hardly said ten words all morning" [1] => a brief statement; "he didn't say a word about it" [2] => information about recent and important events; "they awaited news of the outcome" [3] => a verbal command for action; "when I give the word, charge!" [4] => an exchange of views on some topic; "we had a good discussion" "we had a word or two about it" [5] => a promise; "he gave his word" [6] => a word is a string of bits stored in computer memory; "large computers use words up to 64 bits long" [7] => the divine word of God; the second person in the Trinity (incarnate in Jesus) [8] => a secret word or phrase known only to a restricted group; "he forgot the password" [9] => the sacred writings of the Christian religions; "he went to carry the Word to the heathen" [10] => put into words or an expression; "He formulated his concerns to the board of trustees" ) [example] => Array ( [0] => Could we have a word before you go to the meeting? 你去開會之前,咱們能私下說句話嗎? [1] => My friend sent word that he was well. 我朋友捎來口信說他很好。 ) [normal_sentence] => Array ( [0] => What does this word mean? 這個詞是什麼意思? [1] => I couldn't look up the spelling of the word, as I hadn't a dictionary at hand. 我沒法查這個詞的拼寫,因為我手邊沒有詞典。 [2] => Many English words are derived from Latin. 許多英文單詞源於拉丁文。 [3] => All the words beside the central idea should be crossed out. 凡偏離中心思想的詞語都應通通刪掉。 [4] => The editor eliminated slang words from the essay. 
編輯將俚語從這篇文章中剔除。 [5] => These words can't be staled by repetition. 這些詞語不會因為經常使用而變成陳詞濫調。 [6] => He gave me his visiting card, with a few words in pencil. 他把他的名片給我,上面有幾個鉛筆字。 [7] => I don't believe a word of his story. 他說的這件事我一句話都不相信。 [8] => At the press conference, the reporters copied down every word spoken by the prime minister. 在新聞釋出會上,記者們逐字記下了首相的講話。 [9] => Tell me what happened in your words. 用你自己的話把發生的事告訴我。 [10] => Deeds are better than words when people are in need of help. 當別人需要幫助時,行動勝於語言。 [11] => I would like a word with you. 我想和你談談。 [12] => After a word with the colonel he went away . 他和上校簡單談過之後就走了。 [13] => There's been no word from her for weeks. 已經有好幾個星期沒有她的音信了。 [14] => Word came that I was needed at home. 有信兒來說家裡需要我。 [15] => Word has come that meeting will be held on Tuesday. 通知已到,星期二開會。 [16] => Word is that the election will be held in June. 有訊息說選舉將在六月份舉行。 [17] => Word is that he's left the country. 據說他已經離開這個國家了。 [18] => Word got round that he had resigned. 謠傳他已辭職。 [19] => Stay hidden until I give the word. 我不下令就藏著別動。 [20] => Their word is law. 他們的命令必須服從。 [21] => He gave the word and they let him in. 他說出了口令,他們讓他進去了。 [22] => The word now is “freedom”. 現在的口號是“自由”。 [23] => I give you my word I'll go. 我向你保證,我會去的。 [24] => Stand by your word. 要守信用。 [25] => Hear The Word of God . 聽宣講《聖經》。 [26] => Be careful how you word your answer. 回答時要斟酌字句。 [27] => She worded the explanation well. 她的解釋措辭得體。 [28] => The advice wasn't very tactfully worded. 這份通知措辭不太得體。 [29] => The suggestion might be worded more politely. 那項建議的措辭可以更婉轉些。 [30] => This is a carefully worded contract. 
這是一份措辭嚴謹的合同。 ) [glossary_sentence] => Array ( [0] => address a few words 講幾句話 [1] => await word from sb 等待某人的訊息 [2] => break one's words 食言 [3] => breathe a word 走漏訊息 [4] => bring word 帶來訊息 [5] => choose a word 選擇詞 [6] => coin a word 杜撰一個詞 [7] => cook up words 造新詞 [8] => cross out a word 劃掉一個詞 [9] => cut out many words 刪掉許多詞 [10] => digest a word 消化一個詞 [11] => doubt sb's words 懷疑某人的話 [12] => drink in all the words 吸收所有的詞語 [13] => eat one's words 收回前言,認錯,道歉 [14] => exchange angry words 發生口角 [15] => find words 找出言語(來表達) [16] => gain the good word of 博得…的讚揚 [17] => get word 得到訊息 [18] => get a word 插嘴 [19] => give one's word 保證,允許 [20] => give the word 發出命令 [21] => have words together 爭吵 [22] => have words with sb 與某人吵嘴 [23] => have a word with sb 同某人談一談 [24] => hunt up a word 查一個詞 [25] => keep one's word 信守諾言 [26] => leave word 留言 [27] => leave out a word 省略一個詞,丟掉一個詞 [28] => look up a word (在詞典裡)查一個詞 [29] => memorize words 記單詞 [30] => play on words 玩弄字眼 [31] => pronounce a word 讀一個詞 [32] => put in words for 為…說幾句話 [33] => put the words into sb's mouth 教某人怎麼講 [34] => quote a word 引用一個詞 [35] => receive word of 收到…訊息 [36] => regret one's words 為說過的話而後悔 [37] => respect one's word 遵守自己許下的諾言 [38] => say a word 說句話,進一步,走漏訊息 [39] => say a few words 說幾句話 [40] => say a good word for sb 為某人說好話 [41] => send sb a word 給某人捎個信兒 [42] => spell a word 拼寫一個詞 [43] => stress the word 重讀那個詞 [44] => take back one's word 收回自己的話 [45] => take sb's word for it 相信了某人的話 [46] => understand a word 理解某個詞的意思 [47] => use words 用詞 [48] => waste one's words 白費口舌 [49] => weigh words 斟酌詞句 [50] => write a word 寫一個詞 [51] => advance word 事先傳出的訊息 [52] => angry words 氣話 [53] => beautiful words 優美的言辭 [54] => big words 大話 [55] => borrowed word 外來詞 [56] => broken words 斷斷續續的話 [57] => burning words 熱情洋溢的話 [58] => choice words 精選的詞句 [59] => colorful words 豐富的言辭 [60] => cross words 氣話 [61] => empty words 空洞的話,無意義的話 [62] => everyday word 日常用語 [63] => farewell words 送別詞 [64] => fighting words 容易引起爭論的話,挑戰性的話 [65] => 
foreign word 外來詞 [66] => hard words 憤怒的話,激烈的話 [67] => heated word 激烈的言詞,爭吵時使用的話 [68] => high words 憤怒的話,激烈的話 [69] => hollow words 虛假的言語 [70] => honeyed words 甜言蜜語 [71] => hot words 激烈的言詞,爭吵時使用的話 [72] => household word 家喻戶曉的詞 [73] => irresponsible words 不負責任的話 [74] => key words 關鍵的字眼 [75] => last words 臨終遺言 [76] => living words 現代語 [77] => meaningful words 意味深長的言語 [78] => meaningless words 無意義的話 [79] => misspelled word 拼錯的詞 [80] => native word 本國詞,本地詞 [81] => pleasant words 動聽的語言 [82] => regional word 方言 [83] => scientific word 科學用語 [84] => semi-technical words 半科技詞 [85] => sharp words 憤怒的話,激烈的話 [86] => simple word 簡單的詞 [87] => sincere words 真誠的話 [88] => small word 小詞 [89] => spoken words 口頭語 [90] => suggestive words 含蓄的話 [91] => sweet words 甜言蜜語 [92] => tearful parting words 傷感的離別之言 [93] => the latest word 最新訊息,最後訊息 [94] => uncleanly words 下流話 [95] => unfamiliar word 生詞 [96] => unusual word 冷僻詞 [97] => warm words 忿怒的話,激烈的話 [98] => written words 書面語 [99] => wrong words 錯詞 [100] => dictionary word 詞典裡出現的詞 [101] => English words 英語單詞 [102] => law word 法律用語 [103] => newspaper word 新聞用語 [104] => slang word 俚語 [105] => at a word 立即,立刻 [106] => in a word 簡言之,總之 [107] => in one's own words 用自己的話說 [108] => in other words 換言之 [109] => upon my word 的確,真的 [110] => without a word 一聲沒吭 [111] => word in heavy type 黑體字 [112] => words in season 時宜的話 [113] => words of comfort 安慰的話 [114] => words of command 命令 [115] => words of complaint 怨言 [116] => the W- of God 聖經 [117] => words of praise 表揚的話 [118] => word of six letters 六個字母的詞 [119] => words of thanks 感謝的話 [120] => word the explanation 解釋 [121] => word accurately 準確地用言語表達 [122] => word crudely 簡單地用詞語〔語言〕表達 [123] => word felicitously 恰當地用言語表達 [124] => word intelligibly 清楚地用語言表達 [125] => word positively 明確地用詞語表達 [126] => word vaguely 含糊地表達 [127] => word well 措辭得體 ) [auth_sentence] => Array ( [0] => Rome shall perishswrite that word In the blood that she has spilt. 出自:W. 
Cowper [1] => We have striven..to draw some word from her; but she..answers nothing. 出自:G. P. R. James [2] => To use his own words, he was in a cleft stick. 出自:H. Conway [3] => Actions speak louder than words. 出自:Proverb [4] => He words me, girls, he words me, that I should not Be noble to myself. 出自:Anthony Cleopatra,Shakespeare ) [wrong_use] => Array ( [0] => 我要跟他說句話。 誤 I should like to have word with him. 正 I should like to have a word with him. [1] => 他們聽到訊息說足球比賽將在今晚電視實況轉播。 誤 They had a word that the football match would be televised live this evening. 正 They had word that the football match would be televised live this evening. 析 have word是“聽到訊息〔新聞〕”的意思,“說句話”是have a word。 [2] => 對逐詞背課文,我感到厭倦。 誤 I was tired of reciting the texts word after word. 正 I was tired of reciting the texts word for word. 析 “一字不變地,逐字(背誦或翻譯)”是word for word,不是word after word。 [3] => 我說了什麼錯話嗎? 誤 Have I said any wrong words? 正 Have I said anything wrong? 析 誤句語法上沒有錯,但不符合英語習慣。 [4] => 他不遵守諾言。 誤 He broke his words. 正 He broke his word. 析 break one's word意為“不遵守諾言”, word在此短語中不用複數形式。 [5] => 我剛得知他到達的訊息。 誤 I have just received the word of his arrival. 正 I have just received word of his arrival. [6] => 有訊息傳來說我們的籃球隊贏了這場比賽。 誤 The word came that our basketball team had won the match. 正 Word came that our basketball team had won the match. 析 作“訊息”“資訊”解時, word前不加冠詞。 [7] => 他大約是30年前開始當教師的,換句話說,他當教師已經有30年了。 誤 He began to work as a teacher some thirty years ago, in another word, he has been a teacher for thirty years. 正 He began to work as a teacher some thirty years ago, in other words, he has been a teacher for thirty years. 析 in other words是固定短語,意為“換句話說”。 [8] => 他帶信給我說懷特先生不久將動身去美國。 誤 He carried me words that Mr.White would soon leave for America. 正 He carried me word that Mr. White would soon leave for America. 析 word作“訊息”“信”解時,是不可數名詞,其後不可加s。 [9] => 今晨我們爭吵了。 誤 We had a word this morning. 正 We had words this morning. [10] => 他們曾為雞毛蒜皮的小事同鄰居吵過嘴。 誤 They had word with their neighbour over some trifles. 
正 They had words with their neighbours over some trifles. 析 表示“同某人發生口角”時,用have words with sb, words用複數形式。 [11] => 他說的大話使我們都感到驚訝。 誤 His big word surprised us all. 正 His big words surprised us all. [12] => 我們絕不收回前言。 誤 We should on no account eat our word. 正 We should on no account eat our words. 析 習語big words, eat one's words中, words詞尾的s不可省。 ) [approximate_words] => Array ( [0] => account [1] => advice [2] => chat [3] => communication [4] => declaration [5] => edict [6] => expression [7] => message [8] => notice [9] => order [10] => password [11] => promise [12] => remark [13] => term [14] => couch [15] => explain [16] => express [17] => phrase [18] => put [19] => say [20] => write ) [baike_meaning] => Array ( [0] => word:Microsoft Word,屬於辦公軟體,人們日常生活都有可能接觸到他,對他並不陌生。 簡介 wordMicrosoft Word是微軟公司的一個文書處理器應用程式。它最初是由Richard Bro… ) )