上篇文章,寫到編譯gumbo成功,接下來測試一下gumbo提供的API如何運行
在 https://github.com/google/gumbo-parser#gumbo---a-pure-c-html5-parser 中有一個簡單的實例程序
實例一
#include "gumbo.h"
// Minimal gumbo round trip: parse the HTML text given as the first
// command-line argument, then release the parse tree.
// argv[1] is forwarded without any check, so the program must be started
// with one argument that holds the HTML string itself.
int main(int argc, char** argv) {
  const char* html = argv[1];
  GumboOutput* parsed = gumbo_parse(html);
  // Do stuff with parsed->root
  gumbo_destroy_output(&kGumboDefaultOptions, parsed);
}
將文件命名爲mygb.c,保存在/usr/test目錄下
將頭文件gumbo.h保存在/usr/include/gumbo目錄下
將此前編譯好的靜態庫文件libgumbo.a保存在同級目錄中,如該目錄 /usr/test/libgumbo.a
編譯gcc -I /usr/include/gumbo mygb.c -o mygb.exe /usr/test/libgumbo.a -lpthread
huareal@gpx /usr/test
# gcc -I /usr/include/gumbo mygb.c -o mygb.exe /usr/test/libgumbo.a -lpthread
然後執行
huareal@gpx /usr/test
# ./mygb.exe
Segmentation fault (core dumped)
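執行有錯誤:執行時沒有傳入任何參數,argv[1]爲NULL,程序未做檢查就直接把它交給gumbo_parse,因此出現段錯誤。應將HTML字符串作爲第一個參數傳入,例如 ./mygb.exe "<h1>Hello</h1>"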
實例二
嘗試編譯gumbo自帶的實例程序
gumbo\examples下面有幾個實例
clean_text.cc
find_links.cc
get_title.c
positions_of_class.cc
首先分析get_title.c
嘗試編譯
huareal@gpx /usr/test
# gcc -I /usr/include/gumbo get_title.c -o gettitle.exe /usr/test/libgumbo.a -lpthread
編譯成功
寫一個one.html
<html>
<head>
<title>Hello,gumbo</title>
</head>
<body>
<h1>Test Gumbo</h1>
</body>
</html>
保存在/usr/test當前目錄下
然後執行
huareal@gpx /usr/test
# ./gettitle.exe one.html
Hello,gumbo
執行成功
分析代碼
a:主函數
// Entry point of get_title: prints the <title> text of the HTML file named
// by the single command-line argument.
//
// Exits with EXIT_FAILURE when the argument count is wrong, the file cannot
// be opened, or no input buffer could be obtained; returns 0 on success.
int main(int argc, const char** argv) {
  if (argc != 2) {
    printf("Usage: get_title <html filename>.\n");
    exit(EXIT_FAILURE);
  }
  const char* filename = argv[1];
  // Open in binary mode so the bytes handed to the parser are exactly the
  // bytes stored on disk (no text-mode newline translation).
  FILE* fp = fopen(filename, "rb");
  if (!fp) {
    printf("File %s not found!\n", filename);
    exit(EXIT_FAILURE);
  }
  char* input = NULL;
  int input_length = 0;
  read_file(fp, &input, &input_length);  // load the whole file into memory
  // The stream is no longer needed once its content has been copied.
  fclose(fp);
  if (input == NULL) {
    // Defensive: never hand a missing buffer to the parser.
    printf("Could not read %s!\n", filename);
    exit(EXIT_FAILURE);
  }
  GumboOutput* output = gumbo_parse_with_options(
      &kGumboDefaultOptions, input, input_length);  // parse the HTML text
  // The returned string may point into the parse tree, so it has to be
  // printed before the tree is destroyed.
  const char* title = find_title(output->root);
  printf("%s\n", title);
  gumbo_destroy_output(&kGumboDefaultOptions, output);
  free(input);
  return 0;
}
// Reads everything that remains in `fp` into a freshly allocated buffer.
//
// fp:     open, readable stream; it is read up to end-of-file and is not
//         closed here.
// output: receives the buffer, which is always NUL-terminated; the caller
//         owns it and must free() it. Set to NULL on failure.
// length: receives the number of bytes actually read (the terminating NUL is
//         not counted). Set to 0 on failure.
//
// Failure means: out of memory, a read error on the stream, or content too
// large for its size to be stored in an int.
//
// The data is read in growing chunks rather than sized up front, so the
// reported length always matches the bytes that are really in the buffer.
static void read_file(FILE* fp, char** output, int* length) {
  // Largest value an int can hold, spelled without <limits.h>.
  const size_t max_length = (size_t)(~0u >> 1);
  size_t capacity = 4096;
  size_t used = 0;
  char* buffer = malloc(capacity);

  *output = NULL;
  *length = 0;
  if (buffer == NULL) {
    return;
  }

  for (;;) {
    char* grown;
    // One byte is always held back for the terminating NUL.
    size_t got = fread(buffer + used, 1, capacity - 1 - used, fp);
    used += got;
    if (used < capacity - 1) {
      break;  // short read: end of file or a read error
    }
    if (capacity > max_length / 2) {
      free(buffer);  // doubling would exceed what `length` can express
      return;
    }
    grown = realloc(buffer, capacity * 2);
    if (grown == NULL) {
      free(buffer);
      return;
    }
    buffer = grown;
    capacity *= 2;
  }

  if (ferror(fp)) {
    free(buffer);
    return;
  }

  buffer[used] = '\0';
  *output = buffer;
  *length = (int)used;
}
可以進一步分析
gumbo_parse_with_options
//查看title的實現,針對節點樹的遍歷
static const char* find_title(const GumboNode* root) {
assert(root->type == GUMBO_NODE_ELEMENT);
assert(root->v.element.children.length >= 2);
const GumboVector* root_children = &root->v.element.children;
GumboNode* head = NULL; //首先獲取head節點
int rootChileLength=root_children->length;
int i;
for (i = 0; i <rootChileLength; ++i) {
GumboNode* child = root_children->data[i];
if (child->type == GUMBO_NODE_ELEMENT &&
child->v.element.tag == GUMBO_TAG_HEAD) {
head = child;
break;
}
}
assert(head != NULL);
GumboVector* head_children = &head->v.element.children;
int j;
for (j = 0; j < head_children->length; ++j) {
GumboNode* child = head_children->data[j];
if (child->type == GUMBO_NODE_ELEMENT &&
child->v.element.tag == GUMBO_TAG_TITLE) { //獲取TITLE
if (child->v.element.children.length != 1) {
return "<empty title>";
}
GumboNode* title_text = child->v.element.children.data[0];
assert(title_text->type == GUMBO_NODE_TEXT); //獲取NODE_TEXT
return title_text->v.text.text;
}
}
return "<no title found>";
}
先到這裏,明天繼續分析。