先检测网页中声明的 charset,然后根据该 charset 进行解码。
兼容顺序:gb18030 > gbk > gb2312(gb18030 是 gbk 的超集,gbk 是 gb2312 的超集,因此统一使用 gb18030 解码)。
# Maps a charset name as declared by a page to the codec used to decode it.
# Every GB-family name resolves to GB18030, and both latin1 and utf-8 resolve
# to UTF-8. Each pair is registered twice: once fully upper-cased and once
# fully lower-cased (upper-case entries first).
targetcode = {
    recase(declared): recase(codec)
    for recase in (str.upper, str.lower)
    for declared, codec in (
        ("gbk", "gb18030"),
        ("gb18030", "gb18030"),
        ("gb2312", "gb18030"),
        ("latin1", "utf-8"),
        ("utf-8", "utf-8"),
    )
}
def get_soup(content):
    """Decode raw HTML bytes and parse the result with BeautifulSoup.

    The charset declared inside the document (``charset=...``, with or
    without quotes, in any letter case) is tried first. It is looked up in
    ``targetcode`` so that GB-family declarations are decoded with GB18030;
    names not present in ``targetcode`` are used as-is. If there is no
    declaration, or decoding with it fails, fallback codecs are tried.

    Args:
        content: Raw page body as ``bytes`` (or ``bytearray``).

    Returns:
        A ``BeautifulSoup`` object built with the ``"html.parser"`` backend,
        or ``None`` when ``content`` is not a bytes object.

    Note:
        Only decoding failures are handled here. Errors raised while
        constructing the ``BeautifulSoup`` object propagate to the caller
        instead of being silently swallowed.
    """
    import re

    if not isinstance(content, (bytes, bytearray)):
        # Nothing to decode; keep the "return None on failure" contract.
        return None

    candidates = []

    # Search the raw bytes directly: str(bytes) would yield the repr
    # (b'...'), which escapes quotes and makes matching unreliable.
    declared = re.search(
        br'charset\s*=\s*["\']?\s*([\w.:-]+)', content, re.IGNORECASE
    )
    if declared:
        name = declared.group(1).decode("ascii").lower()
        candidates.append(targetcode.get(name, name))

    # Fallback order matters: utf-8 is strict, so it goes first; gb18030
    # accepts many byte sequences that are really UTF-8; latin1 accepts
    # every byte sequence and therefore must be the last resort.
    candidates.extend(["utf-8", "gb18030", "latin1"])

    for codec in candidates:
        try:
            text = content.decode(codec)
        except (UnicodeError, LookupError):
            # Wrong codec for these bytes, or an unknown/non-text codec name
            # taken from the page; move on to the next candidate.
            continue
        return BeautifulSoup(text, "html.parser")

    # Defensive only: latin1 decodes any input, so this is not expected.
    return None