【发布时间】:2016-07-04 16:27:52
【问题描述】:
我正在通过他们的 API 提取 Twitter 数据,其中一条推文有一个特殊字符(右撇号),我不断收到错误消息,说 Python 无法映射或字符映射该字符。我已经查看了整个互联网,但我还没有找到解决这个问题的方法。我只想用 Python 可以识别的撇号或空字符串(基本上将其删除)替换该字符。我正在使用 Python 3.3。有关如何解决此问题的任何意见?这可能看起来很简单,但我是 Python 的新手。
编辑:这是我用来尝试过滤出引发错误的 unicode 字符的函数。
@staticmethod
def UnicodeFilter(var):
temp = var
temp = temp.replace(chr(2019), "'")
temp = Functions.ToSQL(temp)
return temp
另外,运行程序时,我的错误如下。
'charmap' 编解码器无法对位置 59 中的字符 '\u2019' 进行编码:字符映射到 'undefined'
编辑:这是我的源代码示例:
import json
import mysql.connector
import unicodedata
from MySQLCL import MySQLCL
class Functions(object):
"""This is a class for Python functions"""
@staticmethod
def Clean(string):
temp = str(string)
temp = temp.replace("'", "").replace("(", "").replace(")", "").replace(",", "").strip()
return temp
@staticmethod
def ParseTweet(string):
for x in range(0, len(string)):
tweetid = string[x]["id_str"]
tweetcreated = string[x]["created_at"]
tweettext = string[x]["text"]
tweetsource = string[x]["source"]
truncated = string[x]["truncated"]
inreplytostatusid = string[x]["in_reply_to_status_id"]
inreplytouserid = string[x]["in_reply_to_user_id"]
inreplytoscreenname = string[x]["in_reply_to_screen_name"]
geo = string[x]["geo"]
coordinates = string[x]["coordinates"]
place = string[x]["place"]
contributors = string[x]["contributors"]
isquotestatus = string[x]["is_quote_status"]
retweetcount = string[x]["retweet_count"]
favoritecount = string[x]["favorite_count"]
favorited = string[x]["favorited"]
retweeted = string[x]["retweeted"]
possiblysensitive = string[x]["possibly_sensitive"]
language = string[x]["lang"]
print(Functions.UnicodeFilter(tweettext))
#print("INSERT INTO tweet(ExTweetID, TweetText, Truncated, InReplyToStatusID, InReplyToUserID, InReplyToScreenName, IsQuoteStatus, RetweetCount, FavoriteCount, Favorited, Retweeted, Language, TweetDate, TweetSource, PossiblySensitive) VALUES (" + str(tweetid) + ", '" + Functions.UnicodeFilter(tweettext) + "', " + str(truncated) + ", " + Functions.CheckNull(inreplytostatusid) + ", " + Functions.CheckNull(inreplytouserid) + ", '" + Functions.CheckNull(inreplytoscreenname) + "', " + str(isquotestatus) + ", " + str(retweetcount) + ", " + str(favoritecount) + ", " + str(favorited) + ", " + str(retweeted) + ", '" + str(language) + "', '" + Functions.ToSQL(tweetcreated) + "', '" + Functions.ToSQL(tweetsource) + "', " + str(possiblysensitive) + ")")
#MySQLCL.Set("INSERT INTO tweet(ExTweetID, TweetText, Truncated, InReplyToStatusID, InReplyToUserID, InReplyToScreenName, IsQuoteStatus, RetweetCount, FavoriteCount, Favorited, Retweeted, Language, TweetDate, TweetSource, PossiblySensitive) VALUES (" + str(tweetid) + ", '" + tweettext + "', " + str(truncated) + ", " + Functions.CheckNull(inreplytostatusid) + ", " + Functions.CheckNull(inreplytouserid) + ", '" + Functions.CheckNull(inreplytoscreenname) + "', " + str(isquotestatus) + ", " + str(retweetcount) + ", " + str(favoritecount) + ", " + str(favorited) + ", " + str(retweeted) + ", '" + language + "', '" + tweetcreated + "', '" + str(tweetsource) + "', " + str(possiblysensitive) + ")")
@staticmethod
def ToBool(variable):
if variable.lower() == 'true':
return True
elif variable.lower() == 'false':
return False
@staticmethod
def CheckNull(var):
if var == None:
return ""
else:
return var
@staticmethod
def ToSQL(var):
temp = var
temp = temp.replace("'", "''")
return str(temp)
@staticmethod
def UnicodeFilter(var):
temp = var
#temp = temp.replace(chr(2019), "'")
unicodestr = unicode(temp, 'utf-8')
if unicodestr != temp:
temp = "'"
temp = Functions.ToSQL(temp)
return temp
ekhumoro's 响应正确。
【问题讨论】:
-
你能展示一个数据样本和你的代码吗?
-
感谢您添加 一点 更多信息,但不知道您是如何获得数据的,也不知道究竟是哪一行产生了错误,这很难提供帮助。
标签: python python-3.x unicode