【问题标题】:need a faster way to match these data sets需要一种更快的方法来匹配这些数据集
【发布时间】:2012-07-20 00:53:18
【问题描述】:

我有一组 Excel 工作表,每个设置如下:

ID | imageName
--------------
1    abc.jpg
2    def.bmp
3    abc.jpg
4    xyz123.jpg

此工作表对应一个文件夹,其内容如下:

abc.pdf
ghijkl.pdf
def.pdf
def.xls
x-abc.pdf

我正在尝试生成一个报告,该报告将每个 imageName 的实例与最低 ID 与与之匹配的 PDF 相匹配,并且还识别工作表中不匹配的 imageName 和文件夹中不匹配的 PDF。带有“x-”前缀的文件名等同于不带前缀的文件名,因此该数据集的报告如下:

ID  imageName   filename
-----------------------
1   abc.jpg     abc.pdf
1   abc.jpg     x-abc.pdf
2   def.bmp     def.pdf
4   xyz123.jpg 
                ghijkl.pdf

我目前的解决方案如下:

'sheetObj is the imageName set, folderName is the path to the file folder
sub makeReport(sheetObj as worksheet,folderName as string)

dim fso as new FileSystemObject
dim imageDict as Dictionary
dim fileArray as variant
dim ctr as long


'initializes fileArray for storing filename/imageName pairs
redim fileArray(1,0) 

'returns a Dictionary where key is imageName and value is lowest ID for that imageName
set imageDict=lowestDict(sheetObj)

'checks all files in folder and populates fileArray with their imageName matches
for each file in fso.getfolder(folderName).files
 fileFound=false
 'gets extension and checks if it's ".pdf"
 if isPDF(file.name) then 
  for each key in imageDict.keys
   'checks to see if base names are equal, accounting for "x-" prefix
   if equalNames(file.name,key) then 
    'adds a record to fileArray mapping filename to imageName
    addToFileArray fileArray,file.path,key  
    fileFound=true
   end if
  next
  'checks to see if filename did not match any dictionary entries
  if fileFound=false then 
   addToFileArray fileArray,file.path,""
  end if
 end if
next

'outputs report of imageDict entries and their matches (if any)
for each key in imageDict.keys
 fileFound=false
 'checks for all fileArray matches to this imageName
 for ctr=0 to ubound(fileArray,2)
  if fileArray(0,ctr)=key then
   fileFound=true
   'writes the data for this match to the worksheet
   outputToExcel sheetObj,key,imageDict(key),fileArray(0,ctr)
  end if
 next
 'checks to see if no fileArray match was found
 if fileFound=false then
  outputToExcel sheetObj,key,imageDict(key),""
 end if
next

'outputs unmatched fileArray entries
for ctr=0 to ubound(fileArray,2)
  if fileArray(1,ctr)="" then
   outputToExcel sheetObj,"","",fileArray(0,ctr)
  end if
next

此程序成功输出报告,但速度很慢。由于嵌套的 For 循环,随着 imageName 条目和文件数量的增加,处理它们的时间呈指数增长。

有没有更好的方法来检查这些集合中的匹配项?如果我将fileArray 放入字典可能会更快,但字典不能有重复的键,并且此数据结构需要在其字段中有重复的条目,因为文件名可能匹配多个图像名称,反之亦然。

【问题讨论】:

  • 实际上,它是几何地增长的,而不是指数增长的。

标签: vba excel


【解决方案1】:

这应该很快找到第一个。你可以在最后一个 if 语句的内部做任何你想做的事情。它使用 ADO 记录集,它应该比嵌套的 for 循环更快

Sub match()
Dim sheetName As String: sheetName = "Sheet1"
Dim rst As New ADODB.Recordset
Dim cnx As New ADODB.Connection
Dim cmd As New ADODB.Command

    'setup the connection
    '[HDR=Yes] means the Field names are in the first row
    With cnx
        .Provider = "Microsoft.Jet.OLEDB.4.0"
        .ConnectionString = "Data Source='" & ThisWorkbook.FullName & "'; " & "Extended Properties='Excel 8.0;HDR=Yes;IMEX=1'"
        .Open
    End With

    'setup the command
    Set cmd.ActiveConnection = cnx
    cmd.CommandType = adCmdText
    cmd.CommandText = "SELECT * FROM [" & sheetName & "$]"
    rst.CursorLocation = adUseClient
    rst.CursorType = adOpenDynamic
    rst.LockType = adLockOptimistic

    'open the connection
    rst.Open cmd

    Dim fso As FileSystemObject: Set fso = New FileSystemObject
    Dim filesInFolder As files, f As File
    Set filesInFolder = fso.GetFolder("C:\Users\Bradley\Downloads").files

    For Each f In filesInFolder
        rst.MoveFirst
        rst.Find "imageName = '" & f.Name & "'", , adSearchForward
        If Not rst.EOF Then
            Debug.Print rst("imagename") & "::" & rst("ID") '<-- Do what you need to do here
        End If
    Next f

End Sub

仅供参考:我引用了this 帖子

【讨论】:

    【解决方案2】:

    另一种方式。

    Sub Sample()
        Dim ws As Worksheet, wstemp As Worksheet
        Dim FileAr() As String
        Dim n As Long, wsLRow As Long
    
        Set ws = Sheets("Sheet1") '<~~ Which has imageNames   
        wsLRow = ws.Range("A" & ws.Rows.Count).End(xlUp).Row
    
        n = 0
    
        strFile = Dir("C:\Temp\*.*")
    
        Do While strFile <> ""
            n = n + 1
            ReDim Preserve FileAr(n)
    
            If Mid(strFile, Len(strFile) - 3, 1) = "." Then
                FileAr(n) = Mid(strFile, 1, Len(strFile) - 4)
            ElseIf Mid(strFile, Len(strFile) - 4, 1) = "." Then
                FileAr(n) = Mid(strFile, 1, Len(strFile) - 5)
            Else
                FileAr(n) = strFile
            End If
    
            strFile = Dir
        Loop
    
        Set wstemp = Worksheets.Add
        wstemp.Range("A1").Resize(UBound(FileAr) + 1, 1).Value = Application.Transpose(FileAr)
    
        ws.Range("B1:B" & wsLRow).Formula = "=IF(ISERROR(VLOOKUP(A1," & wstemp.Name & _
                                            "!A:A,1,0)),"""",VLOOKUP(A1," & wstemp.Name & "!A:A,1,0))"
    
        ws.Range("B1:B" & wsLRow).Value = ws.Range("B1:B" & wsLRow).Value
    
        Application.DisplayAlerts = False
        wstemp.Delete
        Application.DisplayAlerts = True
    End Sub
    

    【讨论】:

    • 好主意,避免嵌套循环使运行时间为O(ImageNames) + O(Files) 而不是O(ImageNames * Files),这对于大型数据集来说将是一个很大的改进。但是在OP中错过了几点。一些图像名称在文件中重复(例如abc.jpgx-abc.jpg),并且需要在输出中重复该行。并非所有文件都需要考虑(例如def.xls。不匹配的文件会添加到报告的末尾(例如ghijkl.pdf)。
    【解决方案3】:

    感谢您的回复。

    我最终通过在 folderName 中创建一个文件名数组来解决这个问题,使用 WinAPI FindFirstFileFindNextFile 函数遍历文件夹,因为它是通过网络进行的,因此遍历 @ 返回的集合987654324@ 太慢了。

    然后我从文件名数组中创建了一个文件名/基本名字典,如下所示:

    key         | value
    -----------------------
    abc.pdf     | abc
    x-lmnop.pdf | lmnop
    x-abc.pdf   | abc
    

    从这本字典中,我制作了一个反向字典fileConcat,它连接了来自重复基本名称的键,如下所示:

    key         | value
    -----------------------
    abc         | abc.pdf,x-abc.pdf
    lmnop       | lmnop.pdf
    

    然后我能够将每个 imageDict 键的基本名称与 fileConcat 中的一个键匹配,然后遍历由以下各项生成的串联值数组:

    split(fileConcat(key))

    其中keyimageDict 键的基本名称。

    正如@chrisneilsen 评论的那样,消除嵌套的 For 循环会将增长率降低到 O(ImageNames)+O(Files),并且该函数现在以令人满意的速度执行。

    【讨论】:

      猜你喜欢
      • 2020-10-11
      • 2020-04-17
      • 1970-01-01
      • 2011-09-13
      • 1970-01-01
      • 2016-02-10
      • 1970-01-01
      • 2017-12-12
      • 2013-09-28
      相关资源
      最近更新 更多