# Detect the charset of this web page from its raw (undecoded) content.
chardit1 = chardet.detect(webPage)
charset = chardit1['encoding']
# Convert it into UTF-8 if necessary.
# chardet reports an encoding of None when it cannot make a guess; in that
# case there is nothing to decode with, so the content is left untouched
# instead of crashing on None.lower().
if charset is not None and charset.lower() != "utf-8":
    print("charset=", charset, "convert to string in UTF-8")
    # "ignore" drops bytes that are invalid in the detected charset, so a
    # slightly wrong guess degrades the text rather than raising.
    webPageInUnicode = webPage.decode(charset, "ignore")
    webPage = webPageInUnicode.encode("UTF-8")
For more information:
https://docs.python.org/2/howto/unicode.html#the-unicode-type
No comments:
Post a Comment