为什么多线程内存分配/释放密集型应用程序不随线程数扩展？答案

【问题标题】：Why multithreaded memory allocate/deallocate intensive application does not scale with number of threads?为什么多线程内存分配/释放密集型应用程序不随线程数扩展？
【发布时间】：2014-11-13 15:47:10
【问题描述】：

注意：

原帖标题

为什么来自 DWScript 的多线程 JSON 解析器不随线程数扩展？

已更改，因为此问题与使用 DWScript 处理 JSON 数据无关。问题出在 Delphi XE2 到 XE7 的默认内存管理器中（测试为 XE2 和试用 XE7），但问题首先出现在此类应用程序中。

我有在 Delphi XE2 中处理 JSON 数据的多线程 Win32/Win64 vcl 应用程序。

每个线程使用来自 DWScript 的 TdwsJSONValue.ParseString(sJSON) 解析 JSON 数据，使用 DWScript 方法读取值并将结果存储为记录。

出于测试目的，我在每个线程中处理相同的 JSON 数据。

单次线程运行在线程内需要 N 秒来处理数据。将线程数增加到M 线性（大约M * N）会增加处理相同数据所需的单个线程内的时间。

结果没有提高速度。此应用程序的其他部分（JSON 数据传输、在目标环境中存储结果） - 按预期扩展。

可能是什么原因？任何想法表示赞赏。

补充信息：

在 Win7/32 和 Win7/64、Win8/64 从 2 核到 12 核（w/w-out HT）系统上测试
DWScript 被选为最快的可用（测试了一堆，其中：Superobject，内置 Delphi）。 SO 的行为类似于来自 DWS 的 JSON 单元。
以下是说明问题的完整控制台应用程序。要运行它，我们需要此处提供的示例 json 数据：https://www.dropbox.com/s/4iuv87ytpcdugk6/json1.zip?dl=0 此文件包含第一个线程的数据 json1.dat。对于多达 16 个线程，只需将 json1.dat 复制到 json2.dat...json16.dat。

程序和数据应该在同一个文件夹中。运行：convert.exe N，其中 N 是线程数。

程序以毫秒为单位将执行时间写入 stout - 在线程中花费的时间、解析数据的时间和释放（销毁）TdwsJSONValue 对象的时间。声明 _dwsjvData.Destroy; 无法扩展。

program Convert;

{$APPTYPE CONSOLE}

{$R *.res}

uses
  System.SysUtils,
  System.Diagnostics,
  System.Classes,
  dwsJSON in 'dwsJSON.pas',
  dwsStrings in 'dwsStrings.pas',
  dwsUtils in 'dwsUtils.pas',
  dwsXPlatform in 'dwsXPlatform.pas';

type

  TWorkerThread = class (TThread)
  private
    _iUid:  Integer;
    _swWatch:  TStopwatch;
    _lRunning:  Boolean;

    _sFileJSonData:  String;
    _fJsonData:  TextFile;

  protected
    constructor Create (AUid: Integer);
    procedure Execute; override;

  published
    property Running: Boolean read _lRunning;

  end;

  TConverter = class (TObject)
  private
    _swWatch0, _swWatch1, _swWatch2:  TStopwatch;

    _dwsjvData:  TdwsJSONValue;

  protected
    constructor Create;
    destructor Destroy; override;

    function Calculate (AUid: Integer; AJSonData: String; var AParse, ADestroy: Integer): Integer;
  end;

const
  MAX_THREADS = 16;

var
  iHowMany:  Integer;
  athWorker:  array [1..MAX_THREADS] of Pointer;
  aiElapsed:  array [1..MAX_THREADS] of Integer;
  aiElapsedParse:  array [1..MAX_THREADS] of Integer;
  aiElapsedDestroy:  array [1..MAX_THREADS] of Integer;
  aiFares:  array [1..MAX_THREADS] of Integer;
  swWatchT, swWatchP:  TStopwatch;


constructor TWorkerThread.Create (AUid: Integer);
begin
  inherited Create (True);

  _iUid := AUid;
  _swWatch := TStopwatch.Create;
  _sFileJSonData := ExtractFilePath (ParamStr (0)) + 'json' + Trim (IntToStr (_iUid)) + '.dat';

  _lRunning := False;

  Suspended := False;
end;

procedure TWorkerThread.Execute;
var
  j:  Integer;
  sLine:  String;
  slLines:  TStringList;

  oS:  TConverter;
begin
  _lRunning := True;

  oS := TConverter.Create;

  slLines := TStringList.Create;
  System.AssignFile (_fJsonData, _sFileJSonData);
  System.Reset (_fJsonData);
  j := 0;
  repeat
    System.Readln (_fJsonData, sLine);
    slLines.Add (sLine);
    Inc (j);
  until (j = 50);
//  until (System.Eof (_fJsonData));
  System.Close (_fJsonData);

  Sleep (1000);

  _swWatch.Reset;
  _swWatch.Start;

  aiFares [_iUid] := 0;
  aiElapsedParse [_iUid] := 0;
  aiElapsedDestroy [_iUid] := 0;
  for j := 1 to slLines.Count do
    aiFares [_iUid] := aiFares [_iUid] + oS.Calculate (_iUid, slLines.Strings [j - 1], aiElapsedParse [_iUid], aiElapsedDestroy [_iUid]);

  _swWatch.Stop;

  slLines.Free;
  os.Destroy;

  aiElapsed [_iUid] := _swWatch.ElapsedMilliseconds;

  _lRunning := False;
end;

constructor TConverter.Create;
begin
  inherited Create;

  _swWatch0 := TStopwatch.Create;
  _swWatch1 := TStopwatch.Create;
  _swWatch2 := TStopwatch.Create;
end;

destructor TConverter.Destroy;
begin
  inherited;
end;

function TConverter.Calculate (AUid: Integer; AJSonData: String; var AParse, ADestroy: Integer): Integer;
var
  jFare, jTotalFares, iElapsedParse, iElapsedDestroy, iElapsedTotal:  Integer;
begin
  _swWatch0.Reset;
  _swWatch0.Start;

  _swWatch1.Reset;
  _swWatch1.Start;
  _dwsjvData := TdwsJSONValue.ParseString (AJSonData);
  _swWatch1.Stop;
  iElapsedParse := _swWatch1.ElapsedMilliseconds;

  if (_dwsjvData.ValueType = jvtArray) then
  begin
    _swWatch2.Reset;
    _swWatch2.Start;

    jTotalFares := _dwsjvData.ElementCount;
    for jFare := 0 to (jTotalFares - 1) do
      if (_dwsjvData.Elements [jFare].ValueType = jvtObject) then
      begin

        _swWatch1.Reset;
        _swWatch1.Start;

        _swWatch1.Stop;
      end;
  end;

  _swWatch1.Reset;
  _swWatch1.Start;
  _dwsjvData.Destroy;
  _swWatch1.Stop;
  iElapsedDestroy := _swWatch1.ElapsedMilliseconds;

  _swWatch0.Stop;
  iElapsedTotal := _swWatch0.ElapsedMilliseconds;

  Inc (AParse, iElapsedParse);
  Inc (ADestroy, iElapsedDestroy);

  result := jTotalFares;
end;

procedure MultithreadStart;
var
  j:  Integer;
begin
  for j := 1 to iHowMany do
    if (athWorker [j] = nil) then
    begin
      athWorker [j] := TWorkerThread.Create (j);

      TWorkerThread (athWorker [j]).FreeOnTerminate := False;
      TWorkerThread (athWorker [j]).Priority := tpNormal;
    end;
end;

procedure MultithreadStop;
var
  j:  Integer;
begin
  for j := 1 to MAX_THREADS do
    if (athWorker [j] <> nil) then
    begin
      TWorkerThread (athWorker [j]).Terminate;
      TWorkerThread (athWorker [j]).WaitFor;

      TWorkerThread (athWorker [j]).Free;
      athWorker [j] := nil;
    end;
end;

procedure Prologue;
var
  j:  Integer;
begin
  iHowMany := StrToInt (ParamStr (1));

  for j := 1 to MAX_THREADS do
    athWorker [j] := nil;

  swWatchT := TStopwatch.Create;
  swWatchT.Reset;

  swWatchP := TStopwatch.Create;
  swWatchP.Reset;
end;

procedure RunConvert;

  function __IsRunning: Boolean;
  var
    j:  Integer;
  begin
    result := False;
    for j := 1 to MAX_THREADS do
      result := result or ((athWorker [j] <> nil) and TWorkerThread (athWorker [j]).Running);
  end;

begin

  swWatchT.Start;

  MultithreadStart;

  Sleep (1000);
  while (__isRunning) do
    Sleep (500);

  MultithreadStop;

  swWatchT.Stop;
  Writeln (#13#10, 'Total time:', swWatchT.ElapsedMilliseconds);
end;

procedure Epilogue;
var
  j:  Integer;
begin
  for j := 1 to iHowMany do
    Writeln ( #13#10, 'Thread # ', j, '  tot.time:', aiElapsed [j], '  fares:', aiFares [j], '  tot.parse:', aiElapsedParse [j], '  tot.destroy:', aiElapsedDestroy [j]);

  Readln;
end;

begin
  try
    Prologue;
    RunConvert;
    Epilogue;

  except
    on E: Exception do
      Writeln (E.ClassName, ': ', E.Message);
  end;
end.

【问题讨论】：

没有代码，没有输入过程的描述（源代码），没有你的操作系统和CPU架构？这不是猜谜游戏！
您为此使用 DWScript 有什么特殊原因吗？应用程序使用的数据是否更多地使用 DWScript？
它显然不是真的在多线程上运行。使用调试器找出原因。当您的代码正在运行时，暂停它并查看所有线程的堆栈跟踪。这将向您显示哪些是活动的，哪些被阻止等待其他东西。

标签： json multithreading delphi dwscript

【解决方案1】：

您尝试过我的可扩展内存管理器吗？因为 Delphi（内部使用 fastmm）不能很好地处理字符串和其他与内存相关的东西： https://scalemm.googlecode.com/files/ScaleMM_v2_4_1.zip

您还可以尝试我的分析器的两种分析器模式，看看哪个部分是瓶颈： https://code.google.com/p/asmprofiler/

【讨论】：

还没有，但感谢您的提示。我会做。目前我正在检查英特尔的线程构建块。
我用 ScaleMM 进行了测试。结果更好 - 快 2 倍（8-16 个同时线程的平均速度提高）.dwsJSONValue.Destroy 非常快
但 dwsJSONValue.Parser 明显变慢了。

【解决方案2】：

我对 FastCode MM Challenge 进行了（重新）测试，结果对于 TBB 来说并不是那么好（在块缩减测试中也出现内存不足异常）。

简而言之：ScaleMM2 和 Google TCmalloc 在这个复杂的测试中是最快的，Fastmm 和 ScaleMM2 使用的内存最少。

Average Speed Performance: (Scaled so that the winner = 100%)
  XE6         :   70,4
  TCmalloc    :   89,1
  ScaleMem2   :  100,0
  TBBMem      :   77,8

Average Memory Performance: (Scaled so that the winner = 100%)
  XE6         :  100,0
  TCmalloc    :   29,6
  ScaleMem2   :   75,6
  TBBMem      :   38,4

FastCode 挑战：https://code.google.com/p/scalemm/source/browse/#svn%2Ftrunk%2FChallenge
待定 4.3：https://www.threadingbuildingblocks.org/download

【讨论】：

我需要在应用程序中进行良好的多线程扩展来处理多 TB 的 json 数据，这意味着大量的获取/释放内存。在测试中，英特尔的 TBB 具有最佳扩展性。我还没有测试 TCmalloc，但会做。
@YpsArthur 你确定吗？我测试了您的转换应用程序，SMM2 使用的内存更少，速度更快（我在 TWorkerThread.Execute 中添加了 i := 0 到 50）=> SMM2，4 个线程 = 8.5s，65mb 活动，22mb 空闲； TBBv4.3 = 9s，75mb 活动，73mb（！）空闲。但 TBB 似乎也相当不错:)
是的，TBB 是 16 线程的最佳解决方案，因为即使不是最快的，它也可以在我的生产环境中提供可预测的稳定扩展。内存消耗不是最重要的因素，速度（即使不是最快，但足够快）应该随着线程的增加而稳定。必要时，如果我有时间，我可以为不同的内存管理器提供一些带有测试程序输出的屏幕截图。
@YpsArthur 在我的测试中，TBB 4.4 在多线程 concat 测试中比 ScaleMM2 和 FastMM4 慢 6 倍。

【解决方案3】：

解决方案是将默认的 Delphi XE2 或 XE7 内存管理器与英特尔® 线程构建块内存管理器交换。在示例应用程序中，它可以缩放 ca。应用程序为 64 位时，线程数最多为 16 的线性。

update: with assumption that number of threads running is less than number of cores

这是在运行 KVM 虚拟化 Windows 7 和 124GB RAM 的 2cores/4ht 到 12cores/24ht 机器上测试的

有趣的是对 Win 7 进行虚拟化。内存分配和释放速度是本机 Win 7 的 2 倍。

结论：如果您在多线程（超过 4-8 个线程）应用程序的线程中执行大量 10kB-10MB 块的内存分配/释放操作 - 请仅使用英特尔的内存管理器。

@André：感谢您为我指明正确方向的提示！

这是用于测试的带有 TBB 内存管理器的单元，它必须在主项目文件 .dpr 的单元列表中显示为第一个

unit TBBMem;

interface

function  ScalableGetMem  (ASize: NativeInt): Pointer; cdecl; external 'tbbmalloc' name 'scalable_malloc';
procedure ScalableFreeMem (APtr: Pointer); cdecl; external 'tbbmalloc' name 'scalable_free';
function  ScalableReAlloc (APtr: Pointer; Size: NativeInt): Pointer; cdecl; external 'tbbmalloc' name 'scalable_realloc';

implementation

Function TBBGetMem (ASize: Integer): Pointer;
begin
  result := ScalableGetMem (ASize);
end;

Function TBBFreeMem (APtr: Pointer): Integer;
begin
  ScalableFreeMem (APtr);
  result := 0;
end;

Function TBBReAllocMem (APtr: Pointer; ASize: Integer): Pointer;
begin
  result := ScalableRealloc (APtr, ASize);
end;

const
  TBBMemoryManager:  TMemoryManager = ( GetMem: TBBGetmem;
                                        FreeMem: TBBFreeMem;
                                        ReAllocMem:  TBBReAllocMem; );
var
  oldMemoryManager:  TMemoryManager;

initialization
  GetMemoryManager (oldMemoryManager);
  SetMemoryManager (TBBMemoryManager);

finalization
  SetMemoryManager (oldMemoryManager);

end.

【讨论】：

您使用的是哪个版本的 TBB？你能把包括TBB在内的整个应用程序放在Dropbox上吗？
我对 FastCode MM Challenge 进行了（重新）测试，结果对于 TBB 来说并不是那么好（在块缩减测试中也出现内存不足异常）平均速度性能：（缩放以便获胜者 = 100%) D2010Mem : 70,4 TCmalloc : 89,1 ScaleMem2 : 100,0 TBBMem : 77,8 平均内存性能: (缩放以便获胜者 = 100%) D2010Mem : 100,0 TCmalloc : 29,6 ScaleMem2 : 75,6 TBBMem : 38,4
我拿了 tbbmalloc.dll 版本 4.3.2014.723，我不确定我是否可以将它放在保管箱上，但下载站点是 threadingbuildingblocks.org，完整代码在我的答案中
有快乐的结局——终于找到了满意的解决方案。我没有使用一个应用程序运行多个 (8 - 30) 分配/释放内存密集型线程，而是使用一个分配/释放内存密集型线程简化了应用程序，它以 8-32 个副本并行运行。在这种情况下缩放是最好的，应用程序本身并不复杂