1. char 类型 string 的正则
1.1 match
#include <cstddef>
#include <iostream>
#include <regex>
#include <string>
// Demo of std::regex_match on narrow (char) strings. Three passes are made
// over the same list of file names:
//   1. a plain yes/no match against "[a-z]+\.txt",
//   2. extraction of one capture group,
//   3. extraction of several capture groups.
// regex_match only succeeds when the WHOLE string matches the pattern.
int main()
{
    const std::string candidates[] = {"foo.txt", "bar.txt", "baz.dat", "zoidberg", "001-abc.txt"};

    // Pass 1: does the entire name match?
    const std::string whole_pattern = "[a-z]+\\.txt";
    const std::regex whole_re(whole_pattern);
    std::cout << "regex: " << whole_pattern << std::endl;
    for (const std::string& name : candidates) {
        const bool matched = std::regex_match(name, whole_re);
        std::cout << name << " match: ";
        if (matched) {
            std::cout << "true";
        } else {
            std::cout << "false";
        }
        std::cout << '\n';
    }
    std::cout << "----------------" << std::endl;

    // Pass 2: one parenthesized group. Sub-match 0 is the whole string,
    // sub-match 1 is the text captured by the group.
    const std::string base_pattern = "([a-z]+)\\.txt";
    const std::regex base_re(base_pattern);
    std::cout << "r2: " << base_pattern << std::endl;
    for (const std::string& name : candidates) {
        std::cout << name << std::endl;
        std::smatch groups;
        if (!std::regex_match(name, groups, base_re)) {
            continue;
        }
        std::smatch::size_type idx = 0;
        for (const std::ssub_match& group : groups) {
            std::cout << " sub match[" << idx << "]: " << group.str() << std::endl;
            ++idx;
        }
    }
    std::cout << "----------------" << std::endl;

    // Pass 3: several groups; the match object is reused across names.
    std::string multi_pattern = "([a-z]+)\\.([a-z]+)";
    std::cout << "r3: " << multi_pattern << std::endl;
    const std::regex multi_re(multi_pattern);
    std::smatch parts;
    for (const std::string& name : candidates) {
        if (!std::regex_match(name, parts, multi_re)) {
            continue;
        }
        std::cout << name << '\n';
        std::size_t idx = 0;
        while (idx < parts.size()) {
            const std::string piece = parts[idx].str();
            std::cout << " submatch " << idx << ": " << piece << '\n';
            ++idx;
        }
    }
}
1.2 search
#include <cstddef>
#include <iostream>
#include <regex>
#include <string>
/**
 * Prints every occurrence of the pattern "te[a-z0-9]+" in a fixed sample
 * sentence, together with the character offset of the occurrence measured
 * from the start of the sentence.
 *
 * For each occurrence one line is written to std::cout per sub-match
 * (index 0 is the whole match); every such line reports the offset of the
 * whole match and the number of sub-matches.
 */
void repeat_search_position() {
    const std::string text = "this is a test and te09";

    // Compile the pattern once instead of on every loop iteration.
    const std::regex pattern("te[a-z0-9]+");

    // The iterator searches the original string, so position() is already
    // relative to the start of `text`; no running offset and no copying of
    // the remaining suffix is needed.
    for (std::sregex_iterator it(text.begin(), text.end(), pattern), last; it != last; ++it) {
        const std::smatch& sm = *it;
        const auto pos = sm.position(0);
        for (std::smatch::size_type i = 0; i < sm.size(); ++i) {
            std::cout << "Found '" << sm[i].str() << "' at position " << pos << ", sm.size(): " << sm.size() << '\n';
        }
    }
}
// Entry point of the regex_search demo: writes a separator line (flushed)
// and then runs the repeated-search example.
int main()
{
    const char* const separator = "---------------------";
    std::cout << separator << std::endl;

    repeat_search_position();
    return 0;
}
2. wregex:宽字符正则,需要先把 UTF-8 字符串转换为 wstring(Unicode)再匹配
#include <codecvt>
#include <iostream>
#include <locale>
#include <map>
#include <regex>
#include <string>
#include <vector>
/**
 * Finds every match of a UTF-8 regular expression in a UTF-8 text and
 * returns the matches that are not whitelisted as a serialized JSON array.
 *
 * Matching is performed on wide strings (std::wregex) so that one multi-byte
 * UTF-8 character is treated as one regex character. Each reported element
 * has the fields:
 *   "word"     - the matched text, as UTF-8
 *   "type"     - always ""
 *   "position" - BYTE offset of the match start inside `text`
 *   "context"  - the full input `text`
 *   "snapshot" - always ""
 *
 * @param text    UTF-8 text to scan.
 * @param reg     UTF-8 regular expression (default std::wregex grammar).
 * @param filters Whitelist: a match whose text equals one of these UTF-8
 *                strings exactly is left out of the result.
 * @return `outs.dump()` of the collected array.
 *
 * @note Conversion failures (std::range_error from std::wstring_convert) and
 *       invalid patterns (std::regex_error) are not caught here.
 * @note NOTE(review): `json` and `EASYLOG_DEBUG` come from outside this
 *       snippet (presumably nlohmann::json and a project logging macro) -
 *       confirm the required headers are included.
 */
std::string xxregex(const std::string &text, const std::string &reg, const std::vector<std::string> &filters) {
    // A single converter handles both directions (UTF-8 bytes <-> wide chars).
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;

    std::map<std::wstring, bool> filter_mp;
    for (const auto &f : filters) {
        EASYLOG_DEBUG("filter: %s", f.data());
        filter_mp[conv.from_bytes(f)] = true;
    }

    EASYLOG_DEBUG("reg: %s", reg.data());
    const std::wregex re(conv.from_bytes(reg));
    json outs = json::array();
    const std::wstring wtext = conv.from_bytes(text);

    // Running conversion state: `byte_pos` is the UTF-8 length of
    // wtext[0, converted_chars). It is advanced incrementally so the text is
    // converted back to bytes only once overall, not once per match.
    std::wstring::size_type converted_chars = 0;
    std::string::size_type byte_pos = 0;

    // std::wsregex_iterator searches the original string, which (unlike
    // re-searching a copied suffix) always terminates on patterns that can
    // match the empty string and keeps '^' / '\b' anchored to the real text.
    for (std::wsregex_iterator it(wtext.begin(), wtext.end(), re), last; it != last; ++it) {
        const std::wsmatch &wsm = *it;
        const std::wstring word = wsm.str();
        const auto match_start = static_cast<std::wstring::size_type>(wsm.position(0));
        EASYLOG_DEBUG("Found '%ls' at position: %lu", word.data(), static_cast<unsigned long>(match_start));

        // Extend the byte offset by the characters between the previous
        // match start and this one.
        byte_pos += conv.to_bytes(wtext.data() + converted_chars, wtext.data() + match_start).size();
        converted_chars = match_start;

        EASYLOG_DEBUG("pos: %d, content: %s", static_cast<int>(byte_pos), text.substr(byte_pos, 10).data());
        // The remaining text after this match, viewed in place (no copy).
        EASYLOG_DEBUG("new text: %ls", wtext.c_str() + match_start + word.size());

        if (filter_mp.count(word) > 0) {
            // Whitelisted word: skip it.
            continue;
        }
        EASYLOG_DEBUG("%ls not in filters", word.data());

        json res = {
            {"word", conv.to_bytes(word)},
            {"type", ""},
            {"position", byte_pos},
            {"context", text},
            {"snapshot", ""}
        };
        outs.push_back(res);
    }
    return outs.dump();
}