【问题标题】:How to speed up VBA to Extract data from Word to Excel如何加快 VBA 将数据从 Word 提取到 Excel
【发布时间】:2020-01-14 21:18:57
【问题描述】:

我是 stackoverflow 的新手,也是 VBA 编码的新手。在我的工作中,我们以 Ms Word 的形式提供了发货数据,这不是很有用。我找到了一种使用 VBA 传输数据的方法,并拥有一个功能齐全的代码。但是,数据集包含数十万条记录。我尝试用 20 万条记录运行一个月的数据,花了 5 天时间。只是想知道我的代码中是否有任何可以改进以加快进程的内容。我试过关闭屏幕更新、事件、计算,但效果不大。提前感谢您的帮助。

Sub Word_to_Excel()

Dim FName As String, FD As FileDialog
Dim wdApp As Object
Dim wdDoc As Object
Dim WDR, WDCheck, ShipmentID As Object
Dim ExR As Range
Dim file
Dim Path As String
Dim ImportDate As Object
Dim ImportValue As String
Dim ShipmentIDcheck As String
Dim objResult



    Set objShell = CreateObject("WScript.Shell")

    Set ExR = Selection ' current location in Excel Sheet

    ' Select Folder containing word documents
    Set FD = Application.FileDialog(msoFileDialogFolderPicker)
    FD.Show
    FName = FD.SelectedItems(1)

    file = Dir(FName & "\*.docx")

    Set wdApp = CreateObject("Word.Application")

    ' Open word document in the folder, run macro, close it and open the next word document until there are none left
    Do While file <> ""
    wdApp.Documents.Open Filename:=FName & "\" & file
    wdApp.ActiveWindow.ActivePane.View.Type = 1
    wdApp.Visible = True

    ' Once the word doc is open, go to beginning of document and search for CTY/SITE/SORT:
    wdApp.Selection.HomeKey Unit:=6
    wdApp.Selection.Find.ClearFormatting
    wdApp.Selection.Find.Execute "CTY/SITE/SORT:"
    Set WDCheck = wdApp.Selection

    ' If "CTY/SITE/SORT:" is found, then look for Shipment ID
    Do While WDCheck = "CTY/SITE/SORT:"

    ' Find first shipment
    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.MoveDown Unit:=5, Count:=11
    wdApp.Selection.MoveRight Unit:=1, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=11, Extend:=1
    Set ShipmentID = wdApp.Selection
    ShipmentIDcheck = Replace(ShipmentID, " ", "")

    ' Transfer information from Word to Excel for a Shipment ID and go to the next one.
    ' Shipment ID should be a string that is 11 characters long
    ' If Shipment ID no longer exist, go to next page by searching for the next CTY/SITE/SORT:
    Do While Len(Trim(ShipmentIDcheck)) = 11
        i = i + 1
        ExR(i, 1) = file
        ExR(i, 2) = ShipmentIDcheck

    ' Consignee Name
    wdApp.Selection.MoveUp Unit:=5, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=12
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 3) = Trim(WDR)

    ' Importer Name
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 8) = Trim(WDR)

    ' Shipper Name
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 13) = Trim(WDR)

    ' Quantity
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=10, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 19) = Trim(WDR)

    ' Weight
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=12, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 20) = Trim(WDR)

    ' Value
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=12, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 21) = Trim(WDR)

    ' Broker
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=11, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 23) = Trim(WDR)

    ' Consignee Street
    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.MoveDown Unit:=5, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=13
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 4) = Trim(WDR)

    ' Importer Street
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 9) = Trim(WDR)

    ' Shipper Street
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=23, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 14) = Trim(WDR)

    ' Description
    wdApp.Selection.MoveRight Unit:=1, Count:=8
    wdApp.Selection.MoveRight Unit:=1, Count:=40, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 18) = Trim(WDR)

    ' Consignee City
    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.MoveDown Unit:=5, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=13
    wdApp.Selection.MoveRight Unit:=1, Count:=13, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 5) = Trim(WDR)

    ' Consignee Province
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=2, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 6) = Trim(WDR)

    ' Consignee Postal
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=6, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 7) = Trim(WDR)

    ' Importer City
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=13, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 10) = Trim(WDR)

    ' Importer Province
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=2, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 11) = Trim(WDR)

    ' Importer Postal
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=6, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 12) = Trim(WDR)

    ' Shipper City
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=13, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 15) = Trim(WDR)

    ' Shipper Province
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=2, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 16) = Trim(WDR)

    ' Shipper Postal
    wdApp.Selection.MoveRight Unit:=1, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=6, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 17) = Trim(WDR)

    ' Country of Origin
    wdApp.Selection.MoveRight Unit:=1, Count:=29
    wdApp.Selection.MoveRight Unit:=1, Count:=21, Extend:=1
    Set WDR = wdApp.Selection
    ExR(i, 22) = Trim(WDR)

    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.MoveDown Unit:=5, Count:=2
    wdApp.Selection.MoveRight Unit:=1, Count:=1
    wdApp.Selection.MoveRight Unit:=1, Count:=11, Extend:=1
    Set ShipmentID = wdApp.Selection
    ' Remove spaces from selection. Selection is then used to check if it is a shipment ID.
    ' If it is, then data for that shipment ID is transferred. If not, macro will go to the next page in the Word Doc.
    ShipmentIDcheck = Replace(ShipmentID, " ", "")

    ActiveCell.Offset(1).Select

    Loop

    'Simulate keyboard press "NUMLOCK" to prevent screen from locking
    objResult = objShell.SendKeys("{NUMLOCK}")

    wdApp.Selection.HomeKey Unit:=5
    wdApp.Selection.Find.ClearFormatting
    wdApp.Selection.Find.Execute "CTY/SITE/SORT:"
    Set WDCheck = wdApp.Selection

    Loop

    wdApp.ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
    ActiveWorkbook.Save
    file = Dir()

    Loop

    wdApp.Quit

    MsgBox "Data extraction completed at:" & vbNewLine & Format(Now, "mmmm d, yyyy hh:mm AM/PM")

End Sub

这就是数据集的格式。每天有多个 word 文档包含该数据集的页面和页面。每页的发货数量各不相同。但格式始终相同。 word文档中没有表格,只有用空格分隔的文本。 CTY/SITE/SORT:每个页面都是独一无二的,我用它作为锚点。如果宏找到它,那么它会向下 11 行并获取第一个货件 ID 和其他信息。然后它会检查下一个货件 ID。如果不存在,则转到下一页并重复该过程。

REPORT NUM   : ABC1234                                   OPERATIONS SYSTEM                                       PAGE NUM:   2      
 CTY/SITE/SORT: CA 00123                                    SUMMARY CARGO                                         RUN TIME: 07:33:43 
 SORT DATE    :                                            INBOUND - SCAN                                         RUN DATE: 01AUG19  
                                                                                                                  OPER ID : ABC123  
 MVMT: 12345678   MVMT DT: 01AUG19    MAWB:                  PROD TYP:      DTY TYP:      IMP CTY:      EXP CTY:      BL TYP:        
                        COURIER REMISSION  MANIFEST               EXPORT SITE: US 12345                                

 GCCN ID:               EXPECTED SHPTS:           EXPECTED PKGS:             EXPECTED WEIGHT:                                        

 CUSTOMS NUM     CONSIGNEE NAME           IMPORTER NAME           SHIPPER NAME        CSA    QTY     WGT(LBS)   VALUE  BROKER        
 SHIPMENT ID                                                                               DESCRIPTION           (CAD) CTRY OF ORIGIN
             JOHN SMITH              ABC COMPANY             XYZ COMPANY                      1          1.1      1000.00 UNCONSIGNED
 ABC12345678 123 MAIN STREET         345 RANDOM ROAD         UNIVERSITY OF WASHINGTO       BICYCLE PARTS                             
             VANCOUVER     BC V1A1A1 VANCOUVER     BC V2B1B2 SEATTLE       WA 981234                            US                   
             JOHN SMITH              ABC COMPANY             XYZ COMPANY                      1          1.1      1000.00 UNCONSIGNED
 ABC12345678 123 MAIN STREET         345 RANDOM ROAD         UNIVERSITY OF WASHINGTO       BICYCLE PARTS                             
             VANCOUVER     BC V1A1A1 VANCOUVER     BC V2B1B2 SEATTLE       WA 981234                            US                   
             JOHN SMITH              ABC COMPANY             XYZ COMPANY                      1          1.1      1000.00 UNCONSIGNED
 ABC12345678 123 MAIN STREET         345 RANDOM ROAD         UNIVERSITY OF WASHINGTO       BICYCLE PARTS                             
             VANCOUVER     BC V1A1A1 VANCOUVER     BC V2B1B2 SEATTLE       WA 981234                            US                   
             JOHN SMITH              ABC COMPANY             XYZ COMPANY                      1          1.1      1000.00 UNCONSIGNED
 ABC12345678 123 MAIN STREET         345 RANDOM ROAD         UNIVERSITY OF WASHINGTO       BICYCLE PARTS                             
             VANCOUVER     BC V1A1A1 VANCOUVER     BC V2B1B2 SEATTLE       WA 981234                            US                   
       TOTAL FOR DUTY TYPE COURIER REMISSION                                                                                         
       TOTAL SHIPMENTS:                      4                                                                                       
       TOTAL PACKAGES :                      4                                                                                       
       TOTAL WEIGHT   :                     70.9 LBS                                                                                 
       TOTAL VALUES   :                   4000.00                                                                                         
* * *                                      

我使用以下代码清理数据集并将它们排列为每行一条记录,每行由一个段落分隔(谢谢,macropod)。由于数据被排列成由空格分隔的列,因此我可以将其保存为 .txt 文件并将其导入到 Excel 中。现在的挑战是将代码应用于文件夹中的所有文档,并为每个文档生成一个 .txt 文件。或者如果代码可以将清理后的 .docx 文件中的所有数据合并到一个 .txt 文件中,那就更好了。

Sub CleanWordDoc()
Application.ScreenUpdating = False
Dim p As Long, StrOut As String

With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Wrap = wdFindContinue
    .Forward = True
    .Format = False
    .MatchWildcards = True
    .Text = "REPORT NUM   : * CTRY OF ORIGIN^13" 'Clean header on each page
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}ACTUAL SHP TOTAL*[ ]{20,}^13^m" 'Clean footer on some pages
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}TOTAL FOR DUTY*[ ]{20,}^13^m" 'Clean more footers
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "REPORT NUM   :*SUMMARY*[\*] [\*][ ]{20,}^13" 'Clean last page
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "^m^13" ' Clean all page breaks
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "^13^13" ' Clean empty paragraphs
    .Replacement.Text = "^13"
    .Execute Replace:=wdReplaceAll
    .Text = "<[ ]{1,}^13" ' Clean spaces and paragraphs at the beginning of doc
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "(*)^13(*)^13(*^13)" ' Combine 3 paragraphs into one
    .Replacement.Text = "\1 \2 \3"
    .Execute Replace:=wdReplaceAll
  End With

End With
Application.ScreenUpdating = True
End Sub

这就是清理后的 .docx 的样子(有数百条记录):

 12345678900 ABC COMPANY             DEF COMPANY             XYZ COMPANY                      1          1.1       123.45 AAABROKER    A0B12345LFD ABC ADDRESS             DEF ADDRESS             XYZ ADDRESS                   BICYCLE PARTS                                           VANCOUVER     BC V1A1A1 MARKHAM       ON L1L1L1 SHENZHEN         512323                            CN                   
 98765432100 ABC COMPANY             DEF COMPANY             XYZ COMPANY                      1          1.1       123.45 AAABROKER    A0B12345LFD ABC ADDRESS             DEF ADDRESS             XYZ ADDRESS                   BICYCLE PARTS                                           VANCOUVER     BC V1A1A1 MARKHAM       ON L1L1L1 SHENZHEN         512323                            CN                   

【问题讨论】:

  • 如果你的代码可以运行但速度很慢,那么它更适合Code Review
  • 好的,我会在那里发帖。谢谢
  • 问题是您在循环的每次迭代中多次写入 Excel。建议您使用数组来收集每个循环迭代中的信息 (Redim Preserve) 以累积每一位信息。然后,您可以一次性写入信息。使用具有范围的数组进行研究。
  • 如果不知道(例如)您的数据被拆分了多少个文件,或者数据在 Word 中的确切格式,很难提出有用的建议。对于这个过程来说,5 天肯定太长了。
  • 我添加了一个关于如何在 word 中格式化数据的示例。在一个月内,数据通常被拆分为 100 多个 word 文档,每个 word 文档可以包含 500 页数据。 SmileyFtW,我将研究使用数组和范围。谢谢。

标签: excel vba ms-word


【解决方案1】:

您的代码很慢,因为您正在驱动 Word 分析您的数据。将其作为纯文本处理会快得多

我采取的方法是说服您的数据提供者将其作为文本文件提供。如果不行,那就写一个 VBA 程序,把每个 Word 文件转换成文本。

完成后,使用简单的文本文件处理从文件中读取每一行,分析所需数据并将其提取到变量数组中,然后将结果写入 Excel。

注意:我没有包含完整的代码来分析和提取您的数据,我将把它留给您。包括一个小型 sn-p 以帮助您入门。

类似的东西

Option Explicit

Sub Demo()
    Dim t1 As Single, t2 As Single
    Dim DataFile As String
    Dim DataPath As String
    Dim SavePath As String
    Dim rw As Long
    Dim ws As Worksheet
    Dim WordApp As Word.Application

    On Error GoTo EH

    'identify sheet to take results
    Set ws = ActiveSheet

    t1 = Timer() '<~~ only used to report run time

    ' Create an instance of Word
    Set WordApp = New Word.Application
    WordApp.Visible = False

    ' Set up path to data files
    DataPath = "C:\Data\Temp\SO\" '<~~ update to suit
    SavePath = DataPath & "Txt\" '<~~ optional: save text files to a seperate subfolder

    ' Get first word file in directory
    DataFile = Dir(DataPath & "*.docx")
    Do While DataFile <> vbNullString
        Debug.Print "Convert ", DataFile
        ' Open in word, save as text
        ConvertToText WordApp, DataPath, DataFile, SavePath
        DoEvents

        ' Get next file
        DataFile = Dir
    Loop

    ' Tidy up
    WordApp.Quit
    Set WordApp = Nothing

    t2 = Timer

    Debug.Print "Convert Time", t2 - t1


    t1 = Timer()
    ' Get first text file in directory
    DataFile = Dir(SavePath & "*.txt")
    rw = 1
    Do While DataFile <> vbNullString
        Debug.Print "Read ", DataFile
        ' process the file
        ReadFile ws, SavePath, DataFile, rw
        DoEvents
        ' Get next file
        DataFile = Dir
    Loop


    t2 = Timer

    Debug.Print "Read Time", t2 - t1

Exit Sub
EH:
    On Error Resume Next
    ' Tidy up
    If Not WordApp Is Nothing Then WordApp.Quit
    Set WordApp = Nothing

End Sub

Sub ConvertToText(WordApp As Word.Application, ByVal FilePath As String, ByVal FileName As String, ByVal SavePath As String)
    Dim WordDoc As Word.Document
    Dim i As Long

    ' ensure file is closed if Sub errors
    On Error GoTo EH

    ' Open the file
    Set WordDoc = WordApp.Documents.Open(FilePath & FileName)

    ' generate Text file name
    i = InStrRev(FileName, ".")
    FileName = Left$(FileName, i) & "txt"

    ' Save as text
    WordDoc.SaveAs2 _
        FileName:=SavePath & FileName, _
        FileFormat:=wdFormatText, _
        AddToRecentFiles:=False, _
        EmbedTrueTypeFonts:=False, _
        SaveNativePictureFormat:=False, _
        SaveFormsData:=False, _
        SaveAsAOCELetter:=False, _
        Encoding:=1252, _
        InsertLineBreaks:=False, _
        AllowSubstitutions:=False, _
        LineEnding:=0, _
        CompatibilityMode:=0

EH:
    On Error Resume Next
    ' Close file
    WordDoc.Close False

End Sub

Sub ReadFile(ws As Worksheet, FilePath As String, FileName As String, ByRef rw)
    'parse text file
    Dim Ln As String
    Dim FileNum As Integer

    Dim ExtractedData() As Variant
    Dim idx As Long

    ' ensure file is closed if Sub errors
    On Error GoTo EH

    ' Text file handling
    FileNum = FreeFile
    Open FilePath & FileName For Input As FileNum

    ' Restults array.
    ReDim ExtractedData(1 To 1000000, 1 To 1) ' Excel sheet can hold at most 1048576 rows
    idx = 0
    Do While Not EOF(FileNum)
        ' Read a line from file
        Line Input #FileNum, Ln

        ' Add your code to extract required data here
        If Ln Like " [A-Z][A-Z][A-z]########*" Then
            idx = idx + 1
            ExtractedData(idx, 1) = Ln
        End If
        '============================================
    Loop
    ' Place extracted data onto sheet
    ws.Cells(rw, 1).Resize(idx, 1) = ExtractedData
    ' Update row num for next file
    rw = rw + idx

EH:
    On Error Resume Next
    ' Clean Up
    Close #FileNum
End Sub



【讨论】:

  • 嗨,克里斯,这个代码就像一个魅力。一个问题,在将其保存为 .txt 之前,如何将查找/替换代码(以清除页眉和页脚)合并到 .docx 文件中?
  • @buroughs 我需要更多关于之前和之后的信息来回答这个问题
  • 这是我在上面发布的 Sub CleanWordDoc。如果我将它插入 Word 中的模块,它可以工作,但我无法弄清楚如何在 Excel 中使其工作。
  • 我能弄明白。之前没有用,因为我没有打开 Excel 中对 MsWord 的引用。非常感谢您的帮助。现在转换数据只需几秒钟。
【解决方案2】:

我不清楚“CTY/SITE/SORT:”在哪里或如何与您正在做的事情相关,因为它没有出现在您发布的数据 sn-p 中。下面的 Word 宏显示了如何解析仅包含您发布的数据 sn-p 中的数据的文档。按照编码,它仅在文档末尾输出第一个此类记录 - 为整个文档生成输出所需的代码已被注释掉。代码中的注释显示了输出的结构。将除最后一个之外的所有 vbCr 实例替换为 vbTab 即可将每条记录的输出转换为 Excel 的单行。

有关处理整个 Word 文档文件夹的 Excel 驱动代码,请参阅例如:https://www.excelguru.ca/forums/showthread.php?8900-Help-with-VBA-to-extract-data-from-Word-to-Excel&p=36586&viewfull=1#post36586。正如您将看到的,没有必要使用 Selection - 这会对性能造成重大影响。

Sub Demo()
Application.ScreenUpdating = False
Dim p As Long, StrOut As String
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Wrap = wdFindContinue
    .Forward = True
    .Format = False
    .MatchWildcards = True
    .Text = "^13[!^13]@^13 <[A-Z]{3}[0-9]{8}"
    .Replacement.Text = "^p^&"
    .Execute Replace:=wdReplaceAll
    .Text = "REPORT NUM * CTRY OF ORIGIN^13"
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "^13[ ]@TOTAL FOR DUTY * TOTAL VALUES[!^13]@^13*^13"
    .Replacement.Text = "^p"
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}"
    .Replacement.Text = "^t"
    .Execute Replace:=wdReplaceAll
    .Text = "(^t[A-Z]{2}) ([A-Z0-9]{5,})[ ^t]"
    .Replacement.Text = "\1^t\2^t"
    .Execute Replace:=wdReplaceAll
    .Text = "([0-9]{1,}.[0-9]{2}) "
    .Replacement.Text = "\1^t"
    .Execute Replace:=wdReplaceAll
    .Text = "^13 (<[A-Z]{3}[0-9]{8}) "
    .Replacement.Text = "^p\1^t"
    .Execute Replace:=wdReplaceAll
    .Text = "^13"
    .Replacement.Text = "^l"
    .Execute Replace:=wdReplaceAll
    .Text = "[^l]{2,}"
    .Replacement.Text = "^p"
    .Execute Replace:=wdReplaceAll
    .Text = "^l"
    .Replacement.Text = "^t"
    .Execute Replace:=wdReplaceAll
  End With
  For p = 2 To .Paragraphs.Count - 1
    With .Paragraphs(p).Range
      'StrOut =
      'Shipment ID, Description, Quantity, Weight, Value, Broker, Country of Origin
      'Consignee Name, Consignee Street, Consignee City, Consignee State, Consignee Zip,
      'Importer Name, Importer Street, Importer City, Importer State, Importer Zip,
      'Shipper Name, Shipper Street, Shipper City, Shipper State, Shipper Zip,
      StrOut = StrOut & Split(.Text, vbTab)(8) & vbTab & Split(.Text, vbTab)(12) & vbTab & Split(.Text, vbTab)(4) & vbTab & Split(.Text, vbTab)(5) & vbTab & Split(.Text, vbTab)(6) & vbTab & Split(.Text, vbTab)(7) & vbTab & Split(.Text, vbTab)(24) & vbCr & _
        Split(.Text, vbTab)(1) & vbTab & Split(.Text, vbTab)(9) & vbTab & Split(.Text, vbTab)(15) & vbTab & Split(.Text, vbTab)(16) & vbTab & Split(.Text, vbTab)(17) & vbCr & _
        Split(.Text, vbTab)(2) & vbTab & Split(.Text, vbTab)(10) & vbTab & Split(.Text, vbTab)(18) & vbTab & Split(.Text, vbTab)(19) & vbTab & Split(.Text, vbTab)(20) & vbCr & _
        Split(.Text, vbTab)(3) & vbTab & Split(.Text, vbTab)(11) & vbTab & Split(.Text, vbTab)(21) & vbTab & Split(.Text, vbTab)(22) & vbTab & Split(.Text, vbTab)(23) & vbCr
    End With
  Next
  'Instead of .InsertAfter, write StrOut to Excel
  .InsertAfter vbCr & StrOut
End With
Application.ScreenUpdating = True
End Sub

要填充工作表,您可能会使用以下内容:

Dim StrRow As String, lRow As Long, r As Long, c As Long
With ActiveSheet
  lRow = .UsedRange.Cells.SpecialCells(xlCellTypeLastCell).Row + 1
  For r = 0 To UBound(Split(StrOut, vbCr))
    StrRow = Split(StrOut, vbCr)(r)
    For c = 0 To UBound(Split(StrRow, vbTab))
      .Cells(r + lRow, c + 1).Value = Split(StrRow, vbTab)(c)
    Next
  Next
End With

【讨论】:

  • macropod,我很抱歉。我已经更新了数据 sn-p。
  • 我已经更新了代码,在 Word 中添加了更多查找/替换操作,以考虑您的页面/数据页眉和页脚。
  • 这是一个有趣的代码,我知道它有什么用处。不过,我很难理解和剖析 .Text 和 .Replacement.Text 表达式。并且代码也不会转到 .docx 文件中的下一页。想法是最终将修剪后的数据转换为 .csv 吗?
  • 代码使用 Word 的 wildcard 查找/替换工具进行初始解析。如果您查看文档顶部的剩余内容,您将了解它在这方面所做的工作。您是否查看了代码发送到文档末尾的输出?这基本上就是它要输出到 Excel 的内容。
  • excelforum.com/excel-programming-vba-macros/… 上交叉发布(甚至不承认此处提供的帮助)。发帖礼仪请阅读:excelguru.ca/content.php?184
【解决方案3】:

问题解决了。感谢@chris neilsen 和@macropod 的帮助。

这是我使用的完成代码,它能够在几分钟而不是几天内处理数据:

Option Explicit

Sub ConvertWordtoExcel()
    Dim t1 As Single, t2 As Single
    Dim DataFile As String
    Dim DataPath As String
    Dim SavePath As String
    Dim SavePathFolder As String
    Dim rw As Long
    Dim ws As Worksheet
    Dim WordApp As Word.Application
    Dim FD As FileDialog

    On Error GoTo EH

    'identify sheet to take results
    Set ws = ActiveSheet

    t1 = Timer() '<~~ only used to report run time

    ' Create an instance of Word
    Set WordApp = New Word.Application
    WordApp.Visible = False

    ' Set up path to data files
    Set FD = Application.FileDialog(msoFileDialogFolderPicker) 'Open Folder Picker
    FD.Show
    DataPath = FD.SelectedItems(1) & "\"
    Debug.Print "Folder", DataPath
    SavePath = DataPath & "Txt\" '<~~ save text files to a separate subfolder called Txt
    SavePathFolder = Dir(SavePath, vbDirectory) ' If the Txt subfolder does not exist, create it
        If SavePathFolder = vbNullString Then
            VBA.FileSystem.MkDir (SavePath)
        End If

    ' Get first word file in directory
    DataFile = Dir(DataPath & "*.docx")
    Do While DataFile <> vbNullString
        Debug.Print "Convert ", DataFile
        ' Open in word, save as text
        ConvertToText WordApp, DataPath, DataFile, SavePath
        DoEvents

        ' Get next file
        DataFile = Dir
    Loop

    ' Tidy up
    WordApp.Quit
    Set WordApp = Nothing

    t2 = Timer

    Debug.Print "Convert Time", t2 - t1


    t1 = Timer()
    ' Get first text file in directory
    DataFile = Dir(SavePath & "*.txt")
    rw = 1
    Do While DataFile <> vbNullString
        Debug.Print "Read ", DataFile
        ' process the file
        ReadFile ws, SavePath, DataFile, rw
        DoEvents
        ' Get next file
        DataFile = Dir
    Loop


    t2 = Timer

    Debug.Print "Read Time", t2 - t1

Exit Sub
EH:
    On Error Resume Next
    ' Tidy up
    If Not WordApp Is Nothing Then WordApp.Quit
    Set WordApp = Nothing

End Sub

Sub ConvertToText(WordApp As Word.Application, ByVal FilePath As String, ByVal FileName As String, ByVal SavePath As String)
    Dim WordDoc As Word.Document
    Dim i As Long

    ' ensure file is closed if Sub errors
    On Error GoTo EH

    ' Open the file
    Set WordDoc = WordApp.Documents.Open(FilePath & FileName)

    With WordDoc.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Wrap = wdFindContinue
    .Forward = True
    .Format = False
    .MatchWildcards = True
    .Text = "[ ]{2,}[^13]{1,}(REPORT NUM   :)" 'Clean header on each page
    .Replacement.Text = "\1"
    .Execute Replace:=wdReplaceAll
    .Text = "REPORT NUM   : * CTRY OF ORIGIN^13" 'Clean header on each page
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}ACTUAL SHP TOTAL*[ ]{20,}^13^m" 'Clean footer on some pages
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "[ ]{2,}TOTAL FOR DUTY*[ ]{20,}^13^m" 'Clean more footers
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "REPORT NUM   :*SUMMARY*[\*] [\*][ ]{20,}[^13]{1,}" 'Clean last page
    .Replacement.Text = ""
    .Execute Replace:=wdReplaceAll
    .Text = "^13^m" ' Clean all page breaks
    .Replacement.Text = "^13"
    .Execute Replace:=wdReplaceAll
    .Text = "[^13]{2,}" ' Clean empty paragraphs
    .Replacement.Text = "^13"
    .Execute Replace:=wdReplaceAll
    .Text = "(*)^13(*)^13(*)^13" ' Combine 3 paragraphs into one and add file name at the end
    .Replacement.Text = "\1 \2 \3 " + FileName + "^13"
    .Execute Replace:=wdReplaceAll
  End With
  End With


    ' generate Text file name
    i = InStrRev(FileName, ".")
    FileName = Left$(FileName, i) & "txt"

    ' Save as text
    WordDoc.SaveAs2 _
        FileName:=SavePath & FileName, _
        FileFormat:=wdFormatText, _
        AddToRecentFiles:=False, _
        EmbedTrueTypeFonts:=False, _
        SaveNativePictureFormat:=False, _
        SaveFormsData:=False, _
        SaveAsAOCELetter:=False, _
        Encoding:=1252, _
        InsertLineBreaks:=False, _
        AllowSubstitutions:=False, _
        LineEnding:=0, _
        CompatibilityMode:=0

EH:
    On Error Resume Next
    ' Close file
    WordDoc.Close False

End Sub

Sub ReadFile(ws As Worksheet, FilePath As String, FileName As String, ByRef rw)
    'parse text file
    Dim Ln As String
    Dim FileNum As Integer

    Dim ExtractedData() As Variant
    Dim idx As Long

    ' ensure file is closed if Sub errors
    On Error GoTo EH

    ' Text file handling
    FileNum = FreeFile
    Open FilePath & FileName For Input As FileNum

    ' Restults array.
    ReDim ExtractedData(1 To 1000000, 1 To 1) ' Excel sheet can hold at most 1048576 rows
    idx = 0
    Do While Not EOF(FileNum)
        ' Read a line from file
        Line Input #FileNum, Ln

        ' Add your code to extract required data here
        'If Ln Like " [A-Z][A-Z][A-z]########*" Then
            If Ln Like " *" Then
            idx = idx + 1
            ExtractedData(idx, 1) = Ln
        End If
        'End If
        '============================================
    Loop
    ' Place extracted data onto sheet
    ws.Cells(rw, 1).Resize(idx, 1) = ExtractedData
    ' Update row num for next file
    rw = rw + idx

EH:
    On Error Resume Next
    ' Clean Up
    Close #FileNum
End Sub

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2017-07-31
    • 1970-01-01
    相关资源
    最近更新 更多