【发布时间】:2019-12-21 18:31:28
【问题描述】:
我的进程要么一个接一个地开始,要么它们开始(同时)但没有调用指向函数。我尝试了许多变体,它不会像许多教程教的那样起作用。 我的目标是模糊字符串匹配一个 80k 项目的文本句子列表,丢弃不必要的 90%+ 匹配,同时保持字符串的信息最多(scorer=fuzz.token_set_ratio)。 谢谢!
IDE 是 Anaconda Spyder 4.0、IPython 7.10.1、Python 3.7.5
# -*- coding: utf-8 -*-
import pandas as pd
import multiprocessing
import time
from datetime import datetime
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
#########
preparedDF = []
df1 = []
df2 = []
df3 = []
df4 = []
df5 = []
df6 = []
df7 = []
df8 = []
#########
xdf1 = []
xdf2 = []
xdf3 = []
xdf4 = []
xdf5 = []
xdf6 = []
xdf7 = []
xdf8 = []
#########
def fuzzyPrepare():
#load data do some easy cleaning
global preparedDF
df = pd.read_csv("newEN.csv")
df = df["description"].fillna("#####").tolist()
df = list(dict.fromkeys(df))
try:
df = df.remove("#####")
except ValueError:
pass
preparedDF=df
def fuzzySplit(df=preparedDF):
#split data to feed processes
global df1, df2, df3, df4, df5, df6, df7, df8
df1 = df[:100]
df2 = df[100:200]
df3 = df[200:300]
df4 = df[300:400]
df5 = df[400:500]
df6 = df[500:600]
df7 = df[600:700]
df8 = df[700:800]
def fuzzyMatch(x):
#process.dedupe returns dict_keys object so pass it to a list()
global xdf1, xdf2, xdf3, xdf4, xdf5, xdf6, xdf7, xdf8
if x == 1:
xdf1=list(process.dedupe(df1,threshold=90,scorer=fuzz.token_set_ratio))
elif x == 2:
xdf2=list(process.dedupe(df2,threshold=90,scorer=fuzz.token_set_ratio))
elif x == 3:
xdf3=list(process.dedupe(df3,threshold=90,scorer=fuzz.token_set_ratio))
elif x == 4:
xdf4=list(process.dedupe(df4,threshold=90,scorer=fuzz.token_set_ratio))
elif x == 5:
xdf5=list(process.dedupe(df5,threshold=90,scorer=fuzz.token_set_ratio))
elif x == 6:
xdf6=list(process.dedupe(df6,threshold=90,scorer=fuzz.token_set_ratio))
elif x == 7:
xdf7=list(process.dedupe(df7,threshold=90,scorer=fuzz.token_set_ratio))
elif x == 8:
xdf8=list(process.dedupe(df8,threshold=90,scorer=fuzz.token_set_ratio))
else:
return "error in fuzzyCases!"
#if __name__ == '__main__':
fuzzyPrepare()
fuzzySplit(preparedDF)
#UNHEEDED MULTIPROCESSING, ONLY THIS LINE TRIGGERS THE ACTUAL FUNCTION -> p1 = multiprocessing.Process(name="p1",target=fuzzyMatch(1), args=(1,))
p1 = multiprocessing.Process(name="p1",target=fuzzyMatch, args=(1,))
p2 = multiprocessing.Process(name="p2",target=fuzzyMatch, args=(2,))
p3 = multiprocessing.Process(name="p3",target=fuzzyMatch, args=(3,))
p4 = multiprocessing.Process(name="p4",target=fuzzyMatch, args=(4,))
p5 = multiprocessing.Process(name="p5",target=fuzzyMatch, args=(5,))
p6 = multiprocessing.Process(name="p6",target=fuzzyMatch, args=(6,))
p7 = multiprocessing.Process(name="p7",target=fuzzyMatch, args=(7,))
p8 = multiprocessing.Process(name="p8",target=fuzzyMatch, args=(8,))
jobs = []
jobs.append(p1)
jobs.append(p2)
jobs.append(p3)
jobs.append(p4)
jobs.append(p5)
jobs.append(p6)
jobs.append(p7)
jobs.append(p8)
for j in jobs:
print("process "+ j.name +" started at "+ datetime.now().strftime('%H:%M:%S'))
j.start()
time.sleep(0.3)
for j in jobs:
j.join()
print ("processing complete at "+datetime.now().strftime('%H:%M:%S'))
【问题讨论】:
-
multiprocessing不共享内存。生成Process后对全局变量的更改在进程之间不可见。
标签: python multithreading duplicates multiprocessing fuzzywuzzy