【问题标题】:Does Spacy support multiple GPUs? (Spacy 是否支持多个 GPU?)
【发布时间】:2021-09-28 18:03:22
【问题描述】:

我想知道 Spacy 是否通过 mpi4py 支持多 GPU?

我目前在支持 MPI 协议并具有许多 GPU 的高性能计算集群上使用 Spacy 的 nlp.pipe 进行命名实体识别。这里(here)提到我需要通过 cupy 指定要使用的 GPU,但是结合 PyMPI(mpi4py)使用时,我不确定以下写法是否可行(我应该在调用 cupy 设备之后再导入 spacy 吗?):


from mpi4py import MPI
import cupy

# Every MPI rank runs this script; rank 0 owns the input and collects the output.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

if rank == 0:
    # One list element per text. Multiplying the string itself would build a
    # single giant text inside a one-element list instead of 100 texts.
    texts = ["His friend Nicolas J. Smith is here with Bart Simpon and Fred."] * 100
    # scatter() needs exactly one item per rank, so deal the texts out
    # round-robin into `size` sub-lists (some may be empty if size > len(texts)).
    data = [texts[i::size] for i in range(size)]
else:
    data = None

# Each rank receives its own list of texts.
unit = comm.scatter(data, root=0)

# The global rank can exceed the number of GPUs visible on this node when the
# job spans several nodes, so wrap it to a valid local device index.
device_id = rank % cupy.cuda.runtime.getDeviceCount()

with cupy.cuda.Device(device_id):
    # spaCy/thinc are imported only after the CUDA device has been selected,
    # as in the original question.
    import spacy
    from thinc.api import set_gpu_allocator, require_gpu
    set_gpu_allocator("pytorch")
    require_gpu(device_id)
    nlp = spacy.load('en_core_web_lg')
    nlp.add_pipe("merge_entities")
    # Replace every entity token by its entity label; keep other tokens as-is.
    tmp_list = [
        " ".join(t.ent_type_ if t.ent_type_ else t.text for t in doc)
        for doc in nlp.pipe(unit)
    ]

# gather() returns the list of per-rank results on the root and None elsewhere.
result = comm.gather(tmp_list, root=0)

if rank == 0:
    print(result)

或者,如果我在同一台机器上有 4 个 GPU 并且不想使用 MPI,我可以执行以下操作吗:

from joblib import Parallel, delayed
import cupy

# Module-level GPU index; starts at device 0.
rank = 0

def chunker(iterable, total_length, chunksize):
    """Lazily split a sliceable sequence into consecutive chunks.

    Args:
        iterable: Any object supporting slice indexing (list, str, ...).
        total_length: Number of leading items to cover.
        chunksize: Maximum number of items per chunk; the last chunk may be shorter.

    Returns:
        A generator yielding ``iterable[start:start + chunksize]`` slices.
    """
    offsets = range(0, total_length, chunksize)
    return (iterable[start:start + chunksize] for start in offsets)

def flatten(list_of_lists):
    """Concatenate the items of every sub-iterable into one flat list."""
    combined = []
    for sublist in list_of_lists:
        combined.extend(sublist)
    return combined

def process_chunk(texts, rank=0):
    """Process one chunk of texts on the GPU with index ``rank``.

    The GPU index is an explicit argument: worker processes do not share
    module globals, and incrementing a global inside the function made the
    name local and raised UnboundLocalError before any work was done.

    Args:
        texts: Iterable of strings to run through the spaCy pipeline.
        rank: Index of the CUDA device this call should use (default 0).

    Returns:
        A list with one ``lemmatize_pipe(doc)`` result per input text.
    """
    with cupy.cuda.Device(rank):
        # spaCy/thinc are imported only after the CUDA device is selected,
        # as in the original question.
        import spacy
        from thinc.api import set_gpu_allocator, require_gpu
        set_gpu_allocator("pytorch")
        require_gpu(rank)
        # The pipeline is created inside the worker, after the GPU is chosen;
        # the model name follows the MPI example in the same question.
        nlp = spacy.load('en_core_web_lg')
        # NOTE(review): lemmatize_pipe is not defined in this snippet;
        # presumably it is provided elsewhere by the author -- confirm.
        return [lemmatize_pipe(doc) for doc in nlp.pipe(texts, batch_size=20)]

def preprocess_parallel(texts, chunksize=100):
    """Split ``texts`` into chunks and process them in parallel worker processes.

    Chunks are assigned to GPU indices round-robin so that consecutive chunks
    land on different devices.

    Args:
        texts: Sequence of strings to process.
        chunksize: Number of texts handed to a worker per task.

    Returns:
        A flat list of per-text results, in input order.
    """
    # One worker per GPU; the question assumes 4 GPUs on the machine.
    n_gpus = 4
    executor = Parallel(n_jobs=n_gpus, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    chunks = chunker(texts, len(texts), chunksize=chunksize)
    tasks = (do(chunk, index % n_gpus) for index, chunk in enumerate(chunks))
    return flatten(executor(tasks))

# Guard the entry point: worker processes may re-import this module, and an
# unguarded call would then start the whole job again inside every worker.
if __name__ == '__main__':
    preprocess_parallel(texts = ["His friend Nicolas J. Smith is here with Bart Simpon and Fred."*100], chunksize=1000)

【问题讨论】:

    标签: python-3.x nlp mpi spacy gensim


    【解决方案1】:

    我想我已经知道如何做到这一点了:

    关键是让cupy使用新的GPU。

    import multiprocessing as mp
    mp.set_start_method('spawn', force=True)
    from joblib import Parallel, delayed
    from itertools import cycle
    import cupy
    import spacy
    from thinc.api import set_gpu_allocator, require_gpu
    
    
    def chunker(iterable, total_length, chunksize):
        """Lazily split a sliceable sequence into consecutive chunks.

        Args:
            iterable: Any object supporting slice indexing (list, str, ...).
            total_length: Number of leading items to cover.
            chunksize: Maximum number of items per chunk; the last chunk may be shorter.

        Returns:
            A generator yielding ``iterable[start:start + chunksize]`` slices.
        """
        offsets = range(0, total_length, chunksize)
        return (iterable[start:start + chunksize] for start in offsets)
    
    def flatten(list_of_lists):
        """Concatenate the items of every sub-iterable into one flat list."""
        combined = []
        for sublist in list_of_lists:
            combined.extend(sublist)
        return combined
    
    def process_entity(doc):
        """Render a document as text with entity tokens replaced by their labels.

        Each sentence in ``doc.sents`` is walked token by token. A token with
        an entity type contributes that type; any other token contributes its
        text, unless the text is whitespace-only. Sentences that end up empty
        are dropped, and everything is joined with single spaces.

        Args:
            doc: Object exposing ``sents``, an iterable of token iterables whose
                tokens have ``text`` and ``ent_type_`` attributes.

        Returns:
            The space-joined string for the whole document.
        """
        sentences = []
        for sent in doc.sents:
            words = [
                tok.ent_type_ if tok.ent_type_ else tok.text
                for tok in sent
                if tok.ent_type_ or tok.text.strip() != ""
            ]
            if words:
                sentences.append(" ".join(words))
        return " ".join(sentences)
    
    def process_chunk(texts, rank):
        """Run the NER pipeline over one chunk of texts on GPU ``rank``.

        Args:
            texts: Iterable of strings to process.
            rank: Index of the CUDA device to use for this chunk.

        Returns:
            A list with one ``process_entity(doc)`` string per input text.
        """
        print(rank)
        with cupy.cuda.Device(rank):
            set_gpu_allocator("pytorch")
            require_gpu(rank)
            # The model is loaded only after the GPU has been selected.
            nlp = spacy.load("en_core_web_trf")
            # The former `rank += 1` only changed a local copy right before
            # returning, so it had no effect and has been removed.
            return [process_entity(doc) for doc in nlp.pipe(texts, batch_size=20)]
    
    
    def preprocess_parallel(texts, chunksize=100):
        """Process ``texts`` in chunks, assigning chunks to GPUs round-robin.

        Args:
            texts: Sequence of strings to process.
            chunksize: Number of texts per task.

        Returns:
            A flat list of the per-text results from every chunk.
        """
        n_gpus = cupy.cuda.runtime.getDeviceCount()
        run_on_gpu = delayed(process_chunk)
        # Chunk i goes to GPU i modulo the number of devices.
        tasks = [
            run_on_gpu(chunk, index % n_gpus)
            for index, chunk in enumerate(chunker(texts, len(texts), chunksize=chunksize))
        ]
        pool = Parallel(n_jobs=2, backend='multiprocessing', prefer="processes")
        return flatten(pool(tasks))
    
    if __name__ == '__main__':
        # Demo run: 100 copies of one sentence, processed in chunks of 50.
        sample_texts = ["His friend Nicolas J. Smith is here with Bart Simpon and Fred."] * 100
        processed = preprocess_parallel(texts=sample_texts, chunksize=50)
        print(processed)
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2012-01-09
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多