1. 程式人生 > 實用技巧 >html parser html解析器 C語言 其他語言也有介面 java

html parser html解析器 C語言 其他語言也有介面 java

下載:
git clone https://github.com/google/gumbo-parser.git

預先安裝gcc等
sudo apt-get install libtool

$ cd gumbo-parser/
$ ./autogen.sh
$ ./configure
$ make
$ sudo make install

  • 範例程式碼在 examples/ 目錄下。執行 make 時會自動編譯這些範例,生成的可執行檔放在 gumbo-parser/ 目錄下。

注意:所有操作都在 gumbo-parser/ 目錄下進行。

自己可以修改示例後重新生成:在 gumbo-parser/ 目錄下執行「make 程式名」(程式名不要帶字尾 .cc)。比如修改 examples/find_links.cc 之後,用 make find_links 重新編譯即可。生成的可執行檔案在根目錄(gumbo-parser/)下。

  • 自己整合編譯的話,配置資訊可以用命令pkg-config打出:

$ pkg-config --cflags --libs gumbo  

$ gcc my_program.c `pkg-config --cflags --libs gumbo`

整合gtest也可以。

https://github.com/google/gumbo-parser

修改成遍歷出所有文字節點:

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // limitations under the License. // // Author: [email protected] (Jonathan Tang) // // Finds the URLs of all links in the page. #include <stdlib.h> #include <fstream> #include <iostream> #include <string> #include "gumbo.h" static void search_for_links(GumboNode* node) { if (node->type != GUMBO_NODE_ELEMENT) { return; } GumboAttribute* href; if (node->v.element.tag == GUMBO_TAG_A && (href = gumbo_get_attribute(&node->v.element.attributes, "href"))) { std::cout << href->value << std::endl; } GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { search_for_links(static_cast<GumboNode*>(children->data[i])); } } static void search_for_text(GumboNode* node) { if (node->type == GUMBO_NODE_TEXT) { std::cout << node->v.text.text << std::endl; } if (node->type == GUMBO_NODE_ELEMENT|| node->type == GUMBO_NODE_DOCUMENT|| node->type == GUMBO_NODE_TEMPLATE) { if(node->type == GUMBO_NODE_TEMPLATE){ std::cout << "=== GUMBO_NODE_TEMPLATE ===" << std::endl; } if(node->type == GUMBO_NODE_DOCUMENT){ std::cout << "=== GUMBO_NODE_DOCUMENT ===" << std::endl; } GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { search_for_text(static_cast<GumboNode*>(children->data[i])); } } } int main(int argc, char** argv) { if (argc != 2) { std::cout << "Usage: find_links <html filename>.\n"; exit(EXIT_FAILURE); } const char* filename = argv[1]; std::ifstream in(filename, std::ios::in | std::ios::binary); if (!in) { std::cout << "File " << filename << " not found!\n"; exit(EXIT_FAILURE); } std::string contents; in.seekg(0, std::ios::end); contents.resize(in.tellg()); in.seekg(0, std::ios::beg); in.read(&contents[0], contents.size()); in.close(); GumboOutput* output = gumbo_parse(contents.c_str()); //search_for_links(output->root); search_for_text(output->root); gumbo_destroy_output(&kGumboDefaultOptions, output); }