使用 libxml2 解析 html 网页

创建日期: 2024-10-18 17:35 | 作者: 风波 | 浏览次数: 15 | 分类: C++

项目地址:https://github.com/tenderlove/libxml2 参考代码文件:https://github.com/GNOME/libxml2/blob/master/HTMLparser.c

安装库:apt install libxml2-dev -y

1. 解析 html 内容

1.1 从文件解析

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>

// Path of the HTML file to parse.
std::string filepath{"abc.html"};
// Parse the file into a document tree. No explicit encoding is given, and the
// option flags ask the parser not to report errors or warnings.
xmlDocPtr doc = htmlReadFile(filepath.c_str(), nullptr, HTML_PARSE_NOWARNING | HTML_PARSE_NOERROR);

或者从已打开的文件描述符(例如 socket)中读取,函数原型如下:

// Prototype of htmlReadFd, shown for reference: instead of a file path it takes
// an already-open file descriptor `fd`, followed by a URL, an encoding name and
// the parser option flags, and returns the parsed document.
htmlDocPtr htmlReadFd(int fd, const char *url, const char *encoding, int options)

1.2 从内存中解析

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>

std::string content = "<html></html>";
htmlReadMemory(html.data(), html.size(), NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

1.3 使用 xpath 进行标签检索

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>

std::string filepath = "abc.html";
xmlDocPtr doc = htmlReadFile(filepath.data(), NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

std::string xname = "//a"; # "//div[@class='ok' and @data='a']" 检索指定属性的 div
xmlXPathContextPtr xpathctx = xmlXPathNewContext(doc);
XmlXPathObject xpathobj = xmlXPathEvalExpression(BAD_CAST xname.data(), xpathctx);

for(int i = 0; i < xpathobj->nodesetval->nodeNr; i++) {
    xmlNode * node = xpathobj->nodesetval->nodeTab[pos];
    xmlChar* prop = xmlGetProp(node_, BAD_CAST name.data());
    std::string value = reinterpret_cast<const char*>(prop);
    xmlFree(prop);

    xmlChar* text = xmlNodeGetContent(node_);
    std::string value = reinterpret_cast<const char*>(text);
    xmlFree(text);
}

xmlXPathFreeObject(xpathobj);
xmlXPathFreeContext(xpathctx);
xmlFreeDoc(doc);
xmlCleanupParser();
15 浏览
9 爬虫
0 评论