【发布时间】:2020-09-16 21:43:45
【问题描述】:
我正在尝试用字符串中的空字符串替换所有 URL。 JSON 之后是一个字符串。它不是一个对象。但我很难捕捉到各种排列。
这是我的 python 脚本。但是,如果您查看 https://regex101.com/r/r6tQ3B/2/,您会注意到正则表达式还会删除结尾 ",并且也不会真正捕获速记“t.co”或中间的网址。
for filename in dataFiles:
path = 'data/' + filename
with open(path) as r:
text = re.sub(r'https?:\/\/\S*', '"', text, flags=re.MULTILINE)
with open(path, "w") as w:
w.write(text)
测试:https://regex101.com/r/r6tQ3B/1/
{
"created_at":"Fri Aug 12 10:04:00 +0000 2016",
"id":764039724818272256,
"text":"@theblaze https://t.com/TY9DlZ584c @realDonaldTrump https://t.com/TY9DlZ584c",
"in_reply_to_screen_name":"theblaze",
"source":"<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
"user":{
"id":366636488,
"id_str":"366636488",
"name":"GIL DUPUY",
"screen_name":"DUPUY77",
"location":"Miami",
"url":"http://ggm-dupuy.com",
"description":"Fashion photographer, love action and adventure, care for the less fortunate, don't tolerate any kind of racism regardless of race or religion",
"verified":false,
"followers_count":186,
"friends_count":446,
"utc_offset":null,
"time_zone":null,
"lang":"en",
"default_profile_image":false,
"following":null,
"notifications":null
},
"geo":null,
"coordinates":null,
"place":{
"name":"Frontenac",
"full_name":"Frontenac, MO",
"country_code":"US",
"country":"United States",
"attributes":{
}
},
"retweet_count":0,
"favorite_count":0,
"extended_entities":{
"media":[
{
"id":764039718237409281,
"id_str":"764039718237409281",
"indices":[
27,
50
],
"media_url":"http://pbs.twimg.com/media/CppqE1_UkAE2qFj.jpg",
"media_url_https":"https://pbs.twimg.com/media/CppqE1_UkAE2qFj.jpg",
"url":"https://t.com/TY9DlZ584c",
"display_url":"pic.twitter.com/TY9DlZ584c",
"expanded_url":"http://twitter.com/DUPUY77/status/764039724818272256/photo/1",
"type":"photo",
"sizes":{
"medium":{
"w":640,
"h":1136,
"resize":"fit"
},
"large":{
"w":640,
"h":1136,
"resize":"fit"
},
"thumb":{
"w":150,
"h":150,
"resize":"crop"
},
"small":{
"w":383,
"h":680,
"resize":"fit"
}
}
}
]
},
"favorited":false,
"retweeted":false,
"possibly_sensitive":false,
"lang":"und"
}
【问题讨论】: