根据提供的字符串...
试试:
import pandas as pd
import re
# import unidecode
# Sample product titles: each one repeats the brand name with a different
# casing or spelling, which is what the clean-up below has to collapse.
data = {
    'Name': [
        "LOVABLE Lovable Period Panties Slip da Ciclo Mestruale Flusso Medio (Pacco da 2) Donna",
        "Laessig LÄSSIG Set di Cucchiaio per bambini 4 pezzi Uni menta/mirtillo",
        "Béaba BÉABA, Set di 6 Contenitori per la Pappa per Svezzamento Bebè in Silicone",
        "L´Occitane L'OCCITANE - CREMA MANI NUTRIENTE AL BURRO DI KARITÈ PER PELLI SECCHE 150ML",
    ],
}
# One row per title, in a single 'Name' column.
df = pd.DataFrame(data)
def dedupString(s):
    '''
    Return 's' with repeated words removed, keeping the first occurrence of each word.

    Processing steps:
    - replace the acute accent (´) with a single quote (')
    - split the string into word tokens and punctuation tokens
    - compare words in a normalised form (used only for the comparison):
      a word starting with an uppercase letter is capitalised ('THE' -> 'The'),
      and if its second character is a single quote the following letter is
      uppercased again ("L'oréal" -> "L'Oréal")
    - emit the first occurrence of every word exactly as it was written
    - always emit punctuation tokens; they separate words and are never
      treated as duplicates

    Characters matched by neither alternative of the regular expression
    (for example parentheses or '/') are dropped.

    Returns the kept tokens joined with a single space, so punctuation ends up
    surrounded by spaces (for example "Red , Green").
    '''
    # Must list the same characters as the second character class of the regex below.
    punctuation = ".,!?;-"
    # Replace the acute accent (´) with a single quote (') so both spellings compare equal.
    s = s.replace("´", "'")
    # Split into words (incl. apostrophes) and single punctuation characters.
    # If ticks, slashes etc. go missing from the output, add them to the second
    # character class *before* the trailing '-' (and to 'punctuation' above).
    # Example -> [.,!?;HERE-]. Characters placed after the '-' would turn it
    # into a range operator.
    tokens = re.findall(r"[\w']+|[.,!?;-]", s)
    output = []
    seen = set()
    for token in tokens:
        # Punctuation is passed through untouched; de-duplicating it would
        # drop every comma or dash after the first one.
        if len(token) == 1 and token in punctuation:
            output.append(token)
            continue
        # Build the comparison key: 'THE' and 'The' both become 'The'.
        key = token.capitalize() if token[0].isupper() else token
        # capitalize() lowercases the letter after an apostrophe; restore it so
        # "L'oréal" compares as "L'Oréal".
        if len(key) > 3 and key[1] == "'":
            key = key[:2] + key[2].upper() + key[3:]
        if key in seen:
            continue
        seen.add(key)
        # Emit the original spelling, not the comparison key, so ALL CAPS and
        # mixed-case words (e.g. 'McDonald') keep their casing.
        output.append(token)
    # Return the kept tokens joined with spaces.
    return ' '.join(output)
# Build the cleaned titles in a separate 'Name2' column so they can be
# compared with the untouched originals in 'Name'.
df['Name2'] = df['Name']
# Optional accent stripping (also requires enabling `import unidecode`):
# df['Name2'] = df['Name2'].apply(unidecode.unidecode)
df['Name2'] = df['Name2'].apply(dedupString)
# dedupString joins every token with a space, leaving "word , word"; close that gap.
df['Name2'] = df['Name2'].str.replace(' , ', ', ', regex=False)
print(df)
输出:
Name \
0 LOVABLE Lovable Period Panties Slip da Ciclo M...
1 Laessig LÄSSIG Set di Cucchiaio per bambini 4 ...
2 Béaba BÉABA, Set di 6 Contenitori per la Pappa...
3 L´Occitane L'OCCITANE - CREMA MANI NUTRIENTE A...
Name2
0 LOVABLE Period Panties Slip da Ciclo Mestruale...
1 Laessig LÄSSIG Set di Cucchiaio per bambini 4 ...
2 Béaba, Set di 6 Contenitori per la Pappa Svezz...
3 L'Occitane - CREMA MANI NUTRIENTE AL BURRO DI ...
注意:
- LOVABLE Lovable 变为 LOVABLE,因为只保留首次出现的单词。同样,Béaba BÉABA, 变为 Béaba,:重复的 BÉABA 被删除后,其后的逗号就紧跟在保留下来的第一个单词之后。
- 如果您愿意在上面的代码中覆盖现有列,请将
df['Name2'] = 更改为df['Name'] =。我建议在删除原始字符串列之前检查/采样输出。
- 我已经把用于去除 Unicode 重音字符的两行代码(`import unidecode` 以及调用 `unidecode.unidecode` 的那一行)注释掉了(未经测试)。我暂时没有启用它们,但如果需要,随时可以取消注释。在检查较大的数据集时,您可以观察 Unicode 字符是否会造成问题(例如 façade 与 Facade 之类的字符串是否应被视为重复项)。
您可以在删除重复项之前先转换 Unicode 字符(取消注释上述两行并试试看),也可以保持原样。
这适用于给定的字符串。如果字符消失,请注意代码中的注释(随着数据集的增长,您可能需要更改正则表达式)...
# Split the string into word tokens and single punctuation tokens. If ticks, slashes,
# dashes etc. go missing from the output, add them to the second character class below,
# before the trailing '-' (after it they would form a range). Example -> [.,!?;HERE-]
l = re.findall(r"[\w']+|[.,!?;-]", s)
补充:
如果您期望的输出是将 Laessig LÄSSIG 变为 Laessig,请尝试:
import pandas as pd
import re
import unidecode
# Sample product titles: each one repeats the brand name with a different
# casing or spelling, which is what the clean-up below has to collapse.
data = {
    'Name': [
        "LOVABLE Lovable Period Panties Slip da Ciclo Mestruale Flusso Medio (Pacco da 2) Donna",
        "Laessig LÄSSIG Set di Cucchiaio per bambini 4 pezzi Uni menta/mirtillo",
        "Béaba BÉABA, Set di 6 Contenitori per la Pappa per Svezzamento Bebè in Silicone",
        "L´Occitane L'OCCITANE - CREMA MANI NUTRIENTE AL BURRO DI KARITÈ PER PELLI SECCHE 150ML",
    ],
}
# One row per title, in a single 'Name' column.
df = pd.DataFrame(data)
# Multi-character ASCII spellings applied by toASCII().
# The 'ö'/'Ö' entries are left disabled; enable them if 'ö' should become 'oe'.
swaps = {"ä": "ae",
         # "ö": "oe",
         "ü": "ue",
         "Ä": "Ae",
         # "Ö": "Oe",
         "Ü": "Ue",
         "ß": "ss"}


def _capitalize_if_all_caps(match):
    '''Return the matched word capitalised if it is ALL CAPS, otherwise unchanged.'''
    word = match.group()
    return word.capitalize() if word.isupper() else word


def toASCII(s):
    '''
    Replace the characters listed in 'swaps' with their ASCII spelling.

    Input is a string. It is returned unchanged unless it contains at least one
    character that is a key of 'swaps'. In that case:
    - every whitespace-separated word that is ALL CAPS is set to All Caps, so
      that the swapped spelling comes out consistently ('LÄSSIG' -> 'Laessig')
    - every key of 'swaps' is replaced by its value (for example 'ä' -> 'ae')

    Whitespace between the words is preserved.
    '''
    # Nothing to swap: leave the string (including its casing) untouched.
    if not any(ch in swaps for ch in s):
        return s
    # Capitalise word by word. A plain str.replace() of each ALL CAPS word
    # would also rewrite that text inside longer words that contain it.
    s = re.sub(r"\S+", _capitalize_if_all_caps, s)
    # Replace, for example, 'ä' with 'ae'.
    for old, new in swaps.items():
        s = s.replace(old, new)
    return s
def dedupString(s):
    '''
    Return 's' with repeated words removed, keeping the first occurrence of each word.

    Processing steps:
    - replace the acute accent (´) with a single quote (')
    - split the string into word tokens and punctuation tokens
    - compare words in a normalised form (used only for the comparison):
      a word starting with an uppercase letter is capitalised ('THE' -> 'The'),
      and if its second character is a single quote the following letter is
      uppercased again ("L'oréal" -> "L'Oréal")
    - emit the first occurrence of every word exactly as it was written
    - always emit punctuation tokens; they separate words and are never
      treated as duplicates

    Characters matched by neither alternative of the regular expression
    (for example parentheses or '/') are dropped.

    Returns the kept tokens joined with a single space, so punctuation ends up
    surrounded by spaces (for example "Red , Green").
    '''
    # Must list the same characters as the second character class of the regex below.
    punctuation = ".,!?;-"
    # Replace the acute accent (´) with a single quote (') so both spellings compare equal.
    s = s.replace("´", "'")
    # Split into words (incl. apostrophes) and single punctuation characters.
    # If ticks, slashes etc. go missing from the output, add them to the second
    # character class *before* the trailing '-' (and to 'punctuation' above).
    # Example -> [.,!?;HERE-]. Characters placed after the '-' would turn it
    # into a range operator.
    tokens = re.findall(r"[\w']+|[.,!?;-]", s)
    output = []
    seen = set()
    for token in tokens:
        # Punctuation is passed through untouched; de-duplicating it would
        # drop every comma or dash after the first one.
        if len(token) == 1 and token in punctuation:
            output.append(token)
            continue
        # Build the comparison key: 'THE' and 'The' both become 'The'.
        key = token.capitalize() if token[0].isupper() else token
        # capitalize() lowercases the letter after an apostrophe; restore it so
        # "L'oréal" compares as "L'Oréal".
        if len(key) > 3 and key[1] == "'":
            key = key[:2] + key[2].upper() + key[3:]
        if key in seen:
            continue
        seen.add(key)
        # Emit the original spelling, not the comparison key, so ALL CAPS and
        # mixed-case words (e.g. 'McDonald') keep their casing.
        output.append(token)
    # Return the kept tokens joined with spaces.
    return ' '.join(output)
# Build the cleaned titles in a separate 'Name2' column so they can be
# compared with the untouched originals in 'Name'.
df['Name2'] = df['Name']
# Apply the custom swaps first, then let unidecode handle the remaining non-ASCII characters.
df['Name2'] = df['Name2'].apply(toASCII)
df['Name2'] = df['Name2'].apply(unidecode.unidecode)
# Drop the repeated words once both spellings of a name look the same.
df['Name2'] = df['Name2'].apply(dedupString)
# dedupString joins every token with a space, leaving "word , word"; close that gap.
df['Name2'] = df['Name2'].str.replace(' , ', ', ', regex=False)
print(df)
输出:
Name \
0 LOVABLE Lovable Period Panties Slip da Ciclo M...
1 Laessig LÄSSIG Set di Cucchiaio per bambini 4 ...
2 Béaba BÉABA, Set di 6 Contenitori per la Pappa...
3 L´Occitane L'OCCITANE - CREMA MANI NUTRIENTE A...
Name2
0 LOVABLE Period Panties Slip da Ciclo Mestruale...
1 Laessig Set di Cucchiaio per bambini 4 pezzi U...
2 Beaba, Set di 6 Contenitori per la Pappa Svezz...
3 L'Occitane - CREMA MANI NUTRIENTE AL BURRO DI ...
显然,对于更大的数据集,您需要检查 swaps 字典是否符合您的需求。我已经把其中一部分条目(ö 和 Ö)注释掉了,因为您可能不希望像 Björn 这样的词(如果它出现在更大的数据集中)被转换,等等。