C++17 regex / wregex 正则表达式

创建日期: 2024-08-30 15:35 | 作者: 风波 | 浏览次数: 11 | 分类: C++

1. char 类型 string 的正则

1.1 match

#include <cstddef>
#include <iostream>
#include <regex>
#include <string>


// Demonstrates std::regex_match on narrow (char) strings in three steps:
//   1. whole-string matching against a pattern without capture groups,
//   2. listing the sub-matches of a pattern with one capture group,
//   3. listing the sub-matches of a pattern with two capture groups.
// std::regex_match only succeeds when the pattern covers the entire input.
int main()
{
    const std::string fnames[] = {"foo.txt", "bar.txt", "baz.dat", "zoidberg", "001-abc.txt"};

    // Step 1: plain yes/no matching.
    const std::string plain_pattern = "[a-z]+\\.txt";
    const std::regex txt_regex(plain_pattern);
    std::cout << "regex: " << plain_pattern << std::endl;
    for (const std::string& name : fnames) {
        const bool matched = std::regex_match(name, txt_regex);
        std::cout << name << " match: " << (matched ? "true" : "false") << '\n';
    }

    std::cout << "----------------" << std::endl;

    // Step 2: one capture group. Sub-match 0 is the whole string and
    // sub-match 1 is the parenthesized base name.
    const std::string base_pattern = "([a-z]+)\\.txt";
    const std::regex base_regex(base_pattern);
    std::cout << "r2: " << base_pattern << std::endl;
    for (const std::string& name : fnames) {
        std::cout << name << std::endl;
        std::smatch groups;
        if (!std::regex_match(name, groups, base_regex)) {
            continue;
        }
        std::size_t index = 0;
        for (const std::ssub_match& group : groups) {
            std::cout << "  sub match[" << index << "]: " << group.str() << std::endl;
            ++index;
        }
    }

    std::cout << "----------------" << std::endl;

    // Step 3: two capture groups (base name and extension).
    const std::string pieces_pattern = "([a-z]+)\\.([a-z]+)";
    std::cout << "r3: " << pieces_pattern << std::endl;
    const std::regex pieces_regex(pieces_pattern);
    std::smatch pieces;  // reused across file names; overwritten by every regex_match call
    for (const std::string& name : fnames) {
        if (!std::regex_match(name, pieces, pieces_regex)) {
            continue;
        }
        std::cout << name << '\n';
        std::size_t index = 0;
        for (const std::ssub_match& piece : pieces) {
            const std::string piece_text = piece.str();
            std::cout << "  submatch " << index << ": " << piece_text << '\n';
            ++index;
        }
    }
    return 0;
}

1.2 search

#include <cstddef>
#include <iostream>
#include <regex>
#include <string>

// Prints every match of the pattern "te[a-z0-9]+" in a fixed sample sentence,
// together with the match's character offset from the start of the sentence.
//
// For each match, one line is printed per sub-match (index 0 is the whole
// match; this pattern has no capture groups, so there is exactly one line).
//
// The regex is compiled once and the text is walked with std::sregex_iterator,
// so the pattern is not rebuilt and the remaining text is not copied per match.
// match_results::position() is already measured from the start of the searched
// range, so no running offset has to be maintained by hand.
void repeat_search_position() {
    const std::string text = "this is a test and te09";
    const std::regex pattern("te[a-z0-9]+");

    // A default-constructed regex_iterator is the end-of-sequence iterator.
    for (std::sregex_iterator it(text.begin(), text.end(), pattern), last; it != last; ++it) {
        const std::smatch& sm = *it;
        const auto pos = sm.position();
        for (std::size_t i = 0; i < sm.size(); ++i) {
            std::cout << "Found '" << sm[i].str() << "' at position " << pos << ", sm.size(): " << sm.size() << '\n';
        }
    }
}

// Entry point of the search demo: prints a separator line (flushing the
// stream), then runs the repeated-search example.
int main()
{
    static const char separator[] = "---------------------";
    std::cout << separator << std::endl;
    repeat_search_position();
    return 0;
}

2. wregex 宽字符:需要先把 UTF-8 字符串转换为 wstring(Unicode)再进行匹配

#include <codecvt>
#include <cstddef>
#include <iostream>
#include <locale>
#include <map>
#include <regex>
#include <set>
#include <string>
#include <vector>

/// Searches a UTF-8 text for all matches of a UTF-8 regular expression and
/// returns the matches that are not whitelisted as a dumped JSON array.
///
/// Matching is done on wide strings so that one regex "character" corresponds
/// to one decoded code unit instead of one UTF-8 byte.
///
/// @param text    UTF-8 encoded text to search.
/// @param reg     UTF-8 encoded regular expression (std::wregex default grammar).
/// @param filters UTF-8 encoded whitelist; a match whose text equals one of
///                these entries is not reported.
/// @return `outs.dump()` of a JSON array with one object per reported match:
///         "word" (matched text, UTF-8), "position" (byte offset of the match
///         in `text`), "context" (the whole `text`), and empty "type" /
///         "snapshot" fields.
///
/// The text is walked with std::wsregex_iterator instead of re-running
/// regex_search on a copied suffix. That fixes three problems of the suffix
/// approach: a pattern that can match the empty string no longer loops forever,
/// anchors such as `^` are evaluated against the whole text rather than against
/// every suffix, and the remaining text is not copied after each match.
/// Empty matches are skipped because they carry no word to report.
///
/// NOTE(review): std::wstring_convert / std::codecvt_utf8 are deprecated since
/// C++17 but still available; from_bytes / to_bytes throw std::range_error on
/// input they cannot convert, and std::wregex throws std::regex_error on an
/// invalid pattern. EASYLOG_DEBUG is assumed to take printf-style arguments —
/// confirm against its definition.
std::string xxregex(const std::string &text, const std::string &reg, const std::vector<std::string> &filters) {
    // A single converter serves both directions (UTF-8 bytes <-> wide string).
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;

    // Whitelisted words, stored wide so they compare directly with matches.
    std::set<std::wstring> whitelist;
    for (const auto &f : filters) {
        EASYLOG_DEBUG("filter: %s", f.data());
        whitelist.insert(conv.from_bytes(f));
    }
    EASYLOG_DEBUG("reg: %s", reg.data());
    const std::wregex re(conv.from_bytes(reg));

    json outs = json::array();

    const std::wstring wtext = conv.from_bytes(text);
    std::size_t wide_done = 0;  // wide index up to which byte_pos has been accounted for
    std::size_t byte_pos = 0;   // UTF-8 byte offset corresponding to wide_done
    for (std::wsregex_iterator it(wtext.begin(), wtext.end(), re), last; it != last; ++it) {
        const std::wstring word = it->str();
        if (word.empty()) {
            // Nothing to report; the iterator itself guarantees forward progress.
            continue;
        }

        // position() is measured from the start of wtext.
        const std::size_t pos = static_cast<std::size_t>(it->position());
        // Advance the byte offset by re-encoding only the gap since the last
        // reported match, so the total re-encoding work stays linear.
        byte_pos += conv.to_bytes(wtext.substr(wide_done, pos - wide_done)).size();

        EASYLOG_DEBUG("Found '%ls' at position: %lu", word.data(), static_cast<unsigned long>(pos));
        const std::string word_utf8 = conv.to_bytes(word);
        json res = {
            {"word", word_utf8},
            {"type", ""},
            {"position", byte_pos},
            {"context", text},
            {"snapshot", ""}
        };
        EASYLOG_DEBUG("pos: %d, content: %s", static_cast<int>(byte_pos), text.substr(byte_pos, 10).data());

        byte_pos += word_utf8.size();
        wide_done = pos + word.size();
        // The unsearched remainder is the tail of wtext; point into it instead of copying.
        EASYLOG_DEBUG("new text: %ls", wtext.c_str() + wide_done);

        if (whitelist.count(word) > 0) {
            // Whitelisted word: do not report it.
            continue;
        }
        EASYLOG_DEBUG("%ls not in filters", word.data());
        outs.push_back(res);
    }

    return outs.dump();
}
11 浏览
9 爬虫
0 评论