【问题标题】:Parallelise / Threading a big for loop | Python并行化/线程化一个大的for循环| Python
【发布时间】:2021-07-07 08:44:06
【问题描述】:

我有一个可以正常运行的 Jupyter Notebook,它使用 Faker 库和其他基本的 Python 功能,以 dictionary(字典)的形式生成模拟(假)数据。

阅读其他帖子后,我发现并行化(parallelism)似乎通常应用于函数(方法)。但是,我想把这种技术应用到一个很大的 for 循环上,因为在这个过程中我还会加入更多的“键值列表”。

注意:我在代码中附上了关于列表切片(list slicing)的注释(comments),以备不时之需。

是否可以一次运行for loop 的多次迭代? (或尽可能多)

from faker import Faker
faker = Faker()
Faker.seed(1)

import pandas as pd

import random
random.seed(1)

# Field specifications. Every item is [key, expression]; the expression is a
# string of Python source that the generation loop hands to eval() to produce
# the value stored under that key.
biographic_keys = [
    ['Name', 'faker.unique.name()'],
    ['Aliases', 'faker.unique.user_name()'],
    ['Date of birth', 'faker.unique.date_of_birth().isoformat()'],
    ['Phone numbers', 'faker.unique.phone_number()'],
    ['Addresses', 'faker.unique.address()'],
    ['Nationality', 'faker.unique.country()'],
    ['Social Security Number', 'faker.unique.ssn()'],
    [
        'Alien File Number',
        'random.randrange(1000000, 999999999, random.randint(7, 9))',
    ],
]
biometric_keys = [
    [
        'Height',
        "'{}ft {}inch'.format(random.randint(4, 7), random.randint(0, 11)) "
        "if random.randint(0, 1) == 1 "
        "else '{}cm'.format(random.randint(100, 200))",
    ],
    [
        'Weight',
        "'{}kg'.format(random.randint(60, 130)) "
        "if random.randint(0, 1) == 1 "
        "else '{}st {}lb'.format(random.randint(7, 50), random.randint(0, 13))",
    ],
    [
        'Eye color',
        "random.choice(['Amber', 'Blue', 'Brown', 'Gray', 'Green', 'Hazel'])",
    ],
    [
        'Hair color',
        "random.choice(['Brown', 'Blond', 'Black', 'Auburn', 'Red', 'Gray', 'White', 'Bald'])",
    ],
]

# Number of records to generate.
entries = 4

# Ordered [subset name, field specifications] pairs; each subset becomes one
# nested dict inside every generated record.
alien_key_val = [
    ["Biographic data", biographic_keys],
    ["Biometric data", biometric_keys],
]

#print(alien_key_val[0]) # name, subset
#print(alien_key_val[0][0]) # name
#print(alien_key_val[0][1]) # subset
#print(alien_key_val[0][1][0][0]) # key
#print(alien_key_val[0][1][0][1]) # invoke val

# Build one nested record per entry:
#   {'id_<n>': {subset name: {key: evaluated value}}}
alien_dict = {}

for entry in range(1, entries + 1):
    record = {}
    for group in alien_key_val:
        group_name, fields = group[0], group[1]
        values = {}
        for field in fields:
            key, invoc = field[0], field[1]
            # An expression ending in ':' is not evaluated; its value is the
            # empty string. eval() stays in the plain loop body so expressions
            # resolve names in the same scope as before.
            if invoc[-1] == ':':
                val = ""
            else:
                val = eval(invoc)
            # Values under an 'Identification numbers' key are sequences of
            # pairs and are turned into a dict.
            if 'Identification numbers' in key:
                val = {pair[0]: pair[1] for pair in val}
            values[key] = val
        record[group_name] = values
    alien_dict['id_' + str(entry)] = record

print("\nALIEN_DICT:\n", alien_dict)
>>> ALIEN_DICT:
 {'id_1': {'Biographic data': {'Name': 'Ryan Gallagher', 'Aliases': 'david77', 'Date of birth': '1994-03-12', 'Phone numbers': '(317)066-9074x3915', 'Addresses': '806 Tanya Stream\nNew Jeffreymouth, OH 31051', 'Nationality': 'Guatemala', 'Social Security Number': '237-87-3585', 'Alien File Number': 119580763}, 'Biometric data': {'Height': '4ft 7inch', 'Weight': '120kg', 'Eye color': 'Hazel', 'Hair color': 'White'}}, 'id_2': {'Biographic data': {'Name': 'Tiffany House', 'Aliases': 'jmonroe', 'Date of birth': '1992-12-05', 'Phone numbers': '241-586-8344', 'Addresses': '690 Sanchez Union Suite 625\nChristopherhaven, WI 21957', 'Nationality': 'Maldives', 'Social Security Number': '861-51-6071', 'Alien File Number': 177366680}, 'Biometric data': {'Height': '4ft 6inch', 'Weight': '60kg', 'Eye color': 'Hazel', 'Hair color': 'Bald'}}, 'id_3': {'Biographic data': {'Name': 'Allen Williams DDS', 'Aliases': 'kholland', 'Date of birth': '1973-11-13', 'Phone numbers': '038.836.8595', 'Addresses': '890 Bowers View Apt. 883\nHerringfort, MN 75211', 'Nationality': 'Mexico', 'Social Security Number': '205-65-6774', 'Alien File Number': 775747704}, 'Biometric data': {'Height': '175cm', 'Weight': '27st 0lb', 'Eye color': 'Amber', 'Hair color': 'Brown'}}, 'id_4': {'Biographic data': {'Name': 'Mr. Gregory Ryan', 'Aliases': 'stephen03', 'Date of birth': '1991-12-27', 'Phone numbers': '(892)184-0110', 'Addresses': '41925 Jones Estate Suite 824\nShawnmouth, NJ 15468', 'Nationality': 'Anguilla', 'Social Security Number': '320-50-5626', 'Alien File Number': 655004368}, 'Biometric data': {'Height': '148cm', 'Weight': '34st 11lb', 'Eye color': 'Amber', 'Hair color': 'Auburn'}}}

解决方案附在下方。如果您认为您的解决方案是更好的选择,请添加解决方案。我很想学习未来的其他方法。

【问题讨论】:

  • 您的意思是要同时访问循环中的每个元素吗?多线程怎么样?
  • 我想是的 :)。有意思,那我该怎么做呢?最终目标是在保持稳定的前提下让整个流程运行得更快。
  • 我认为你可以先循环创建线程。然后,所有线程将开始运行并且不再循环。但我不确定它会变得更快。
  • 根据您的建议,我在下面附加了一个解决方案。但是,如何按id 的顺序将它们加在一起?

标签: python-3.x multithreading dictionary for-loop parallel-processing


【解决方案1】:

灵感来自Simple multithread for loop in Python 顶级解决方案。

使用 import multiprocessing.dummy as mp,并把我那个很大的 for 循环过程转换成一个函数(function)。

所有“entry”字典都被收集到列表 dicts 中,然后按原计划合并到字典 big_boi_dict 中。

...

def alien_dict_func(entry):
    """Build the generated record for a single entry.

    Reads the module-level ``alien_key_val`` specification, a sequence of
    ``[subset_name, [[key, expression], ...]]`` items, and evaluates each
    expression string to obtain the value stored under its key.

    Args:
        entry: Identifier of the record; it is stringified into the
            ``'id_<entry>'`` key of the result.

    Returns:
        A one-item dict ``{'id_<entry>': {subset_name: {key: value}}}``.
    """
    entry_dict = {}
    for subset in alien_key_val:
        subset_name, fields = subset[0], subset[1]
        subset_dict = {}
        for data in fields:
            key, invoc = data[0], data[1]
            # An empty expression, or one ending in ':', is a placeholder and
            # yields an empty string instead of being evaluated. Checking for
            # emptiness first avoids an IndexError on invoc[-1].
            if not invoc or invoc[-1] == ':':
                val = ""
            else:
                # NOTE(review): eval() runs arbitrary code; the expressions
                # must only ever come from trusted, in-file specifications.
                # It is called here, in the function body, so expressions can
                # still refer to this function's local names.
                val = eval(invoc)
            # Values under an 'Identification numbers' key are sequences of
            # pairs and are turned into a dict.
            if 'Identification numbers' in key:
                val = {pair[0]: pair[1] for pair in val}
            subset_dict[key] = val
        entry_dict[subset_name] = subset_dict
    return {'id_' + str(entry): entry_dict}


import multiprocessing.dummy as mp 

if __name__ == "__main__":
    # multiprocessing.dummy.Pool is backed by threads. The context manager
    # releases the pool even when a task raises; the original only reached
    # close()/join() on the success path.
    with mp.Pool(4) as pool:
        # map() blocks until every task has finished and returns the results
        # in the order of the input range, so the ids stay in ascending order.
        # Use range(0, 1000) to replicate the linked example.
        dicts = pool.map(alien_dict_func, range(1, entries + 1))

    # Each result is a one-item {'id_<n>': record} dict; merge them in order.
    big_boi_dict = {}
    for d in dicts:
        big_boi_dict.update(d)
    print("big_boi_dict: ", big_boi_dict)
>>> big_boi_dict:  {'id_1': {'Biographic data': {'Name': 'Jacob Gaines', 'Aliases': 'laurenswanson', 'Date of birth': '2016-04-20', 'Phone numbers': '630-868-7169x899', 'Addresses': '0340 Lydia Passage Suite 898\nAliciaside, NC 54017', 'Nationality': 'Netherlands Antilles', 'Social Security Number': '646-75-5043', 'Alien File Number': 216185864}, 'Biometric data': {'Height': '4ft 3inch', 'Weight': '84kg', 'Eye color': 'Gray', 'Hair color': 'Blond'}}, 'id_2': {'Biographic data': {'Name': 'Carlos Wallace', 'Aliases': 'andreabray', 'Date of birth': '1927-09-11', 'Phone numbers': '069-056-6401x106', 'Addresses': '7567 Timothy Drive Suite 202\nMichealberg, WY 38137', 'Nationality': 'Zambia', 'Social Security Number': '423-34-8418', 'Alien File Number': 472177351}, 'Biometric data': {'Height': '7ft 0inch', 'Weight': '111kg', 'Eye color': 'Amber', 'Hair color': 'Brown'}}, 'id_3': {'Biographic data': {'Name': 'Jason Hill', 'Aliases': 'kimberly73', 'Date of birth': '2002-11-20', 'Phone numbers': '661.123.2301x4271', 'Addresses': '16908 Amanda Key\nLake Taraville, OH 89507', 'Nationality': 'Italy', 'Social Security Number': '855-86-1944', 'Alien File Number': 20427192}, 'Biometric data': {'Height': '125cm', 'Weight': '77kg', 'Eye color': 'Brown', 'Hair color': 'White'}}, 'id_4': {'Biographic data': {'Name': 'Melinda White PhD', 'Aliases': 'hartmanerica', 'Date of birth': '1917-05-19', 'Phone numbers': '(174)876-1971x2693', 'Addresses': '8478 Kristina Road Suite 710\nSaraview, ND 82480', 'Nationality': 'French Southern Territories', 'Social Security Number': '779-13-3745', 'Alien File Number': 501832948}, 'Biometric data': {'Height': '148cm', 'Weight': '128kg', 'Eye color': 'Gray', 'Hair color': 'Auburn'}}}

【讨论】:

    【解决方案2】:

    对于同时访问循环中每个元素的问题,我尝试过类似下面的做法:

    import time
    import threading
    
    
    class handle_data(threading.Thread):
        """Demo worker thread that prints its name and counter when run.

        Args:
            threadID: Caller-chosen identifier, stored as ``threadID``.
            name: Thread name, used in the printed messages.
            counter: Value printed by ``run``.
        """

        def __init__(self, threadID, name, counter):
            super().__init__()
            self.threadID = threadID
            self.name = name
            self.counter = counter
            # Defined up front so the attribute exists before stop() is
            # called; previously it was only created inside stop().
            self.running = True

        def run(self):
            """Print a start line, the name/counter line and an exit line."""
            print("Starting " + self.name)
            print(self.name, 5, self.counter)
            print("Exiting " + self.name)

        def stop(self):
            """Clear the ``running`` flag (``run`` does not consult it)."""
            self.running = False
    
    
    if __name__ == "__main__":
        # Start ten threads; each receives its index as both id and counter.
        for index in range(10):
            worker = handle_data(index, "Thread-" + str(index), index)
            worker.start()
    

    【讨论】:

    • 我现在试试这个
    • @StressdBoi_69420 我只是编辑代码并删除了一个 for 循环
    猜你喜欢
    • 1970-01-01
    • 2012-10-17
    • 2013-09-02
    • 2017-03-17
    • 1970-01-01
    • 2021-04-09
    相关资源
    最近更新 更多