【发布时间】:2021-12-15 05:20:58
【问题描述】:
我正在尝试把一个包含多层循环的函数改写为使用多进程（multiprocessing）的版本，以便并行处理数据。
初始代码如下所示:
from tqdm import tqdm
import re
import re, string
def transform_data(sentence, aug):
    """Strip ASCII punctuation from every string in *sentence*.

    Args:
        sentence: Iterable of strings to clean.
        aug: Not used by the code shown here; kept so existing callers
            that pass it keep working.

    Returns:
        A list with one cleaned string per input item, in input order.
    """
    # some processing from different module function
    # The character class is the same for every item, so build it once
    # instead of once per iteration.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return [re.sub(punctuation_class, '', k) for k in sentence]
def bulk_aug(final_df_num):
    """Clean and re-join the sentences of every record in *final_df_num*.

    Args:
        final_df_num: Iterable of dicts. Each dict has a 'key' entry and a
            'data' entry holding a list of sentence groups (each group is a
            list of strings).

    Returns:
        A list with one ``{'key': ..., 'data': [...]}`` dict per input
        record. 'data' holds one string per position within a group, built
        by space-joining the cleaned sentences found at that position
        across all groups of the record.
    """
    final_aug_data = []
    for data_keys in tqdm(final_df_num):
        random_key = data_keys['key']
        sentences = data_keys['data']
        aug_data = [
            transform_data(chunked_sentence, 2)
            for chunked_sentence in tqdm(sentences)
        ]
        # NOTE(review): debug output kept from the original; its original
        # nesting level was lost, so it is placed after the inner loop.
        print(aug_data)
        # zip(*aug_data) regroups the cleaned sentences by position, so the
        # i-th sentence of every group ends up in the same output string.
        final_aug_data.append({'key': random_key,
                               'data': [" ".join(k) for k in zip(*aug_data)]})
    return final_aug_data
数据如下:
# Sample input: one dict per record, each with a 'key' and a list of
# sentence groups under 'data'.
data = [
    {
        'key': 12,
        'data': [
            ['this is a ?sentence1', 'this is a sentence1'],
            ['this is a sentence2', 'this is a sentence2'],
            ['this is a sentence3', 'this is a sentence3'],
            ['this is a sentence4', 'this is a sentence4'],
        ],
    },
    {
        'key': 190,
        'data': [
            ['this is a sentence11', 'this is a sentence11'],
            ['this is a sentence22', 'this is a sentence22'],
            ['this is a sentence33', 'this is a sentence33'],
            ['this is a sentence44', 'this is a sentence44'],
        ],
    },
    {
        'key': 1900,
        'data': [
            ['this is a sentence55', 'this is a sentence55'],
            ['this is a sentence66', 'this is a sentence66'],
            ['this is a sentence77', 'this is a sentence77'],
            ['this is a sentence88', 'this is a sentence88'],
        ],
    },
]
输出如下所示:
# bulk_aug(data)
[{'key': 12,
'data': ['this is a sentence1 this is a sentence2 this is a sentence3 this is a sentence4',
'this is a sentence1 this is a sentence2 this is a sentence3 this is a sentence4']},
{'key': 190,
'data': ['this is a sentence11 this is a sentence22 this is a sentence33 this is a sentence44',
'this is a sentence11 this is a sentence22 this is a sentence33 this is a sentence44']},
{'key': 1900,
'data': ['this is a sentence55 this is a sentence66 this is a sentence77 this is a sentence88',
'this is a sentence55 this is a sentence66 this is a sentence77 this is a sentence88']}]
我想将此函数转换为多进程版本。以下是我的尝试：
def bulk_aug(final_df_num):
    """Clean and re-join the sentences of a single record.

    Per-record variant: it handles one record per call, so it can be mapped
    over the records one at a time.

    Args:
        final_df_num: One dict with a 'key' entry and a 'data' entry holding
            a list of sentence groups (each group is a list of strings).

    Returns:
        A one-element list containing ``{'key': ..., 'data': [...]}``.
        'data' holds one string per position within a group, built by
        space-joining the cleaned sentences found at that position across
        all groups.
    """
    random_key = final_df_num['key']
    sentences = final_df_num['data']
    aug_data = [
        transform_data(chunked_sentence, 2)
        for chunked_sentence in tqdm(sentences)
    ]
    # NOTE(review): debug output kept from the original.
    print(aug_data)
    # zip(*aug_data) regroups the cleaned sentences by position, so the
    # i-th sentence of every group ends up in the same output string.
    return [{'key': random_key,
             'data': [" ".join(k) for k in zip(*aug_data)]}]
from multiprocessing import Pool
import time
# The guard is required where worker processes are started by re-importing
# this module (the "spawn" start method); without it each worker would try
# to create its own pool on import.
if __name__ == "__main__":
    n_workers = 2
    # With the default chunksize of 1 every record is sent to a worker as a
    # separate task; batching records cuts inter-process overhead for long
    # inputs. Result order is unchanged.
    chunksize = max(1, len(data) // (n_workers * 4))
    with Pool(n_workers) as p:
        r = list(tqdm(p.imap(bulk_aug, data, chunksize=chunksize),
                      total=len(data)))
但是处理数据需要花费太多时间。代码有问题吗?
【问题讨论】:
-
你确定耗时来自计算本身，而不是 imap 中 IPC（进程间通信）的开销吗？
-
您确定发布了您正在运行的实际代码吗?您的串行代码不完整,并且您的多处理代码会生成 TypeError: string indices must be integers 错误(与您的串行代码一样)。
标签: python python-3.x multithreading parallel-processing multiprocessing