我先用 Tokenizer 对所有单词进行标记化,并把每个片段转换成词索引序列,保存为嵌套列表。然后逐步累加第一个列表中的片段并拼接成字符串,与第二个列表累加得到的字符串缓冲区进行比较;当两个字符串相同但所用的片段数量不同时,就视为一次匹配,并记录双方对应的索引。最后,我去掉了 out1 和 out2 中指向相同片段的索引值。
from keras.preprocessing.text import Tokenizer
# One tokenizer is shared by both segmentations so that a given word maps to
# the same integer id no matter which list it came from.
tokenizer = Tokenizer()

# list 1
list1 = [["When"], ["Grazia Deledda"], ["submitted a short story"], ["to"],
         ["a"], ["fashion magazine"], ["at"], ["the age of"], ["13"], ["EOS"]]
# list 2
list2 = [["When"], ["Grazia Deledda"], ["submitted"], ["a short story"], ["to"],
         ["a fashion"], ["magazine at"], ["the age of"], ["13"], ["EOS"]]

# Fit on both lists; the second call adds to the counts of the first.
tokenizer.fit_on_texts([" ".join(item) for item in list1])
tokenizer.fit_on_texts([" ".join(item) for item in list2])

# Convert each list on its own.  Pairing them with zip() would silently drop
# the trailing segments of the longer list whenever the two segmentations
# have a different number of entries.
# Each element is the tokenizer output for a one-string list, i.e. a list
# holding a single token-id list, so the ids of segment i are seqN[i][0].
seq1 = [tokenizer.texts_to_sequences(item) for item in list1]
seq2 = [tokenizer.texts_to_sequences(item) for item in list2]
# Token ids the fitted tokenizer assigns to the "EOS" sentinel.  Looking the
# value up (instead of hard-coding [16]) keeps the end-of-sequence test valid
# when the vocabulary, and therefore the id of "eos", is different.
eos_sequence = tokenizer.texts_to_sequences(["EOS"])[0]

out1 = []  # list1 indices belonging to spans that list2 segments differently
out2 = []  # list2 indices of the segments covering the same words
current_index = 0  # list2 position just after the most recent match

# Try every list1 start position except the final (EOS) entry.
for seq1_index in range(len(seq1) - 1):
    out1_buffer = []  # list1 indices accumulated from this start position
    tokens1 = []      # their token ids, flattened in order
    found = False

    # Grow the list1 span one segment at a time until it matches a list2
    # span or the EOS sentinel (or the end of seq1) is reached.
    for pos1 in range(seq1_index, len(seq1)):
        if seq1[pos1][0] == eos_sequence:
            break
        out1_buffer.append(pos1)
        tokens1.extend(seq1[pos1][0])

        # Rebuild the list2 span from the first unconsumed position,
        # skipping the final (EOS) entry.
        out2_buffer = []
        tokens2 = []
        for seq2_index in range(current_index, len(seq2) - 1):
            out2_buffer.append(seq2_index)
            tokens2.extend(seq2[seq2_index][0])
            count_seq1 = len(out1_buffer)
            count_seq2 = len(out2_buffer)
            # Same words, but split into a different number of segments.
            # Comparing the flattened id lists is equivalent to comparing
            # the space-joined id strings for non-empty segments.
            if tokens1 == tokens2 and count_seq1 != count_seq2:
                print("string_a", [list1[i] for i in out1_buffer])
                print("string_b", [list2[i] for i in out2_buffer])
                current_index = seq2_index + 1
                print("match", count_seq1, count_seq2)
                out1.extend(out1_buffer)
                out2.extend(out2_buffer)
                found = True
                break
        if found:
            break
# Keep only the segments that really differ between the two lists: a phrase
# that appears verbatim on both sides is removed from both index sets.
result1 = []
for idx1 in out1:
    phrase = list1[idx1]
    if any(list2[idx2] == phrase for idx2 in out2):
        # Identical phrase on both sides: drop every such entry from out2
        # and do not report idx1.
        out2 = [idx2 for idx2 in out2 if list2[idx2] != phrase]
    else:
        result1.append(idx1)

# Map the surviving indices back to their phrases and print both sides.
tuple1 = tuple(list1[idx1] for idx1 in result1)
tuple2 = tuple(list2[idx2] for idx2 in out2)
print("{}\n{}\n".format(tuple1, tuple2))
输出
(['submitted a short story'], ['a'], ['fashion magazine'], ['at'])
(['submitted'], ['a short story'], ['a fashion'], ['magazine at'])