【问题标题】:Extract numbers from chemical formula - including multiple elements within parentheses从化学式中提取数字 - 包括括号内的多个元素
【发布时间】:2022-01-16 07:43:21
【问题描述】:

我在电子表格中有数千个化学式的列表,我想计算每个化学元素在每个化学式中出现的次数。这里给出了一些例子:

  1. CH3NO3
  2. CSe2
  3. C2Cl2
  4. C2Cl2O2
  5. C4H6COOH
  6. (C6H5)2P(CH2)6P(C6H5)2

我发现@PEH (Extract numbers from chemical formula) 的一些代码运行良好。但是,在提取数千个值时会变得非常慢。因此,我创建了一个数组版本(见下文),并设法通过其他用户的一些输入来加快速度(How to speed up extracting numbers from chemical formula)。它有效并且确实加快了速度。但是,我还需要它在括号内找到多个元素(下面的代码目前不适用于上面的化学式 6 - 应该是 30 C、32 H、2 P)。我希望有人能够提出一种基于下面的正则表达式方法来实现这一目标的方法。原始 ChemRegex 中有一些代码可以执行此操作(https://stackoverflow.com/a/46091904/17194644),但我无法使其在子中工作 - 如果我尝试将其包含在子中,则会出现此错误:

Private RegEx As RegExp

Function CountElements(ChemFormulaRange As Variant, ElementRange As Variant) As Variant

'define variables
Dim RetValRange() As Long
Dim RetVal As Long
Dim ChemFormula As String
Dim npoints As Long
Dim i As Long
Dim mpoints As Long
Dim j As Long

' Connvert input ranges to variant arrays
If TypeName(ChemFormulaRange) = "Range" Then ChemFormulaRange = ChemFormulaRange.Value
If TypeName(ElementRange) = "Range" Then ElementRange = ElementRange.Value

'parameter
npoints = UBound(ChemFormulaRange, 1) - LBound(ChemFormulaRange, 1) + 1
mpoints = UBound(ElementRange, 2) - LBound(ElementRange, 2) + 1

'dimension array
ReDim RetValRange(1 To npoints, 1 To mpoints)

If RegEx Is Nothing Then
    Set RegEx = New RegExp
    ' apply the properties
End If

'calculate all values
For j = 1 To mpoints
    Element = ElementRange(1, j)
        For i = 1 To npoints
        RetVal = 0
        ChemFormula = ChemFormulaRange(i, 1)
            Call ChemRegex(ChemFormula, Element, RetVal, RegEx)
        RetValRange(i, j) = RetVal
        Next i
Next j

'output answer
CountElements = RetValRange

End Function
Private Sub ChemRegex(ChemFormula, Element, RetVal, RegEx)
    
'ChemRegex created by PEH (CC BY-SA 4.0) https://stackoverflow.com/a/46091904/17194644
    
    With RegEx
        .Global = True
        .MultiLine = True
        .IgnoreCase = False
    End With
    
    'first pattern matches every element once
    RegEx.Pattern = "([A][cglmrstu]|[B][aehikr]?|[C][adeflmnorsu]?|[D][bsy]|[E][rsu]|[F][elmr]?|[G][ade]|[H][efgos]?|[I][nr]?|[K][r]?|[L][airuv]|[M][cdgnot]|[N][abdehiop]?|[O][gs]?|[P][abdmortu]?|[R][abefghnu]|[S][bcegimnr]?|[T][abcehilms]|[U]|[V]|[W]|[X][e]|[Y][b]?|[Z][nr])([0-9]*)"
    
    Dim Matches As MatchCollection
    Set Matches = RegEx.Execute(ChemFormula)
    
    Dim m As Match
    For Each m In Matches
        If m.SubMatches(0) = Element Then
            RetVal = RetVal + IIf(Not m.SubMatches(1) = vbNullString, m.SubMatches(1), 1)
        End If
    Next m
    
End Sub

【问题讨论】:

  • 将 1000 的原始电子表格拆分为多张,每张 1000。然后复制粘贴特殊值并组合结果。
  • 被调用的函数 (ChemRegEx) 看起来需要四个参数(ChemFormula、Element、RetVal、RegEx),但您只提供了前两个参数,这将引发您看到的错误。
  • 这个问题归结为调试您对ChemRegEx 的调用。剩下的就是噪音。我建议编辑 Q 以专注于此,包括指向函数源的链接,然后删除其余部分。顺便说一句,bugdrown 已经为您提供了您需要知道的内容
  • 我已经回答了一个关于在 PHP 中展平/扩展化学方程式的类似问题,这可能对简单的括号公式有用:stackoverflow.com/a/20672549/406712

标签: arrays excel regex vba


【解决方案1】:

看起来您调整了代码以重用我上次建议的 RegExp 对象,实际上我希望这会大大提高性能。但是我应该更好地解释如何实现,但请参阅下面的示例。

在示例中,我还采用了您的第二个 RegExp 模式,但对其余部分进行了重新设计。这个例子似乎适用于你的样本数据,但这就是我测试过的全部!

Option Explicit    
Private regEx As RegExp
Private regEx2 As RegExp

Sub Test()
' formulas in A2:A7 and elements in B1:H1 (see OP's screenshot), return results in B2:H7
    Range("B2:H7").Value = CountElements(Range("A2:A7").Value, Range("B1:H1"))
End Sub

Function CountElements(ChemFormulaRange As Variant, ElementRange As Variant) As Variant
Dim RetValRange() As Long
Dim RetVal As Long
Dim ChemFormula As String
Dim i As Long, j As Long
Dim mpoints As Long, npoints As Long
Dim Element As String

    If regEx Is Nothing Then
        Set regEx = New RegExp
        With regEx
            .Global = True
            '.MultiLine = True ' ? only if working with multilines
            .IgnoreCase = False
            
            'first pattern matches every element once
            .Pattern = "([A][cglmrstu]|[B][aehikr]?|[C][adeflmnorsu]?|[D][bsy]|[E][rsu]|[F][elmr]?|[G][ade]|[H][efgos]?|[I][nr]?|[K][r]?|[L][airuv]|[M][cdgnot]|[N][abdehiop]?|[O][gs]?|[P][abdmortu]?|[R][abefghnu]|[S][bcegimnr]?|[T][abcehilms]|[U]|[V]|[W]|[X][e]|[Y][b]?|[Z][nr])([0-9]*)"
        End With
        
        Set regEx2 = New RegExp
        With regEx2
            .Global = True
            '.MultiLine = True ?
            .IgnoreCase = False

            'second patternd finds parenthesis and multiplies elements within
            .Pattern = "(\((.+?)\)([0-9])+)+?"
        End With
    End If


    ' Convert input ranges to variant arrays
    If TypeName(ChemFormulaRange) = "Range" Then ChemFormulaRange = ChemFormulaRange.Value
    If TypeName(ElementRange) = "Range" Then ElementRange = ElementRange.Value

    'parameter
    npoints = UBound(ChemFormulaRange, 1) - LBound(ChemFormulaRange, 1) + 1
    mpoints = UBound(ElementRange, 2) - LBound(ElementRange, 2) + 1

    'dimension arrays
    ReDim RetValRange(1 To npoints, 1 To mpoints)

    'calculate all values
    For i = 1 To npoints
        ChemFormula = ChemFormulaRange(i, 1)
        For j = 1 To mpoints
            RetVal = 0
            Element = ElementRange(1, j)
            Call ChemRegex(ChemFormula, Element, RetVal)
            RetValRange(i, j) = RetVal
        Next
    Next

    'output answer
    CountElements = RetValRange
    
   ' Set regEx = Nothing: Set regEx2 = Nothing

End Function

Private Sub ChemRegex(ChemFormula, Element, RetVal)
Dim Matches As MatchCollection, Matches2 As MatchCollection
Dim m As Match, m2 As Match
    
    Set Matches = regEx.Execute(ChemFormula)
    For Each m In Matches
        If m.SubMatches(0) = Element Then
            RetVal = RetVal + IIf(Not m.SubMatches(1) = vbNullString, m.SubMatches(1), 1)
        End If
    Next m

    If InStr(1, ChemFormula, "(") Then ' if the formula includes elements within parentheses
        Set Matches2 = regEx2.Execute(ChemFormula)
        For Each m2 In Matches2
            Set Matches = regEx.Execute(m2.Value)
            For Each m In Matches
                If m.SubMatches(0) = Element Then
                    If m.SubMatches(1) = vbNullString Then
                        RetVal = RetVal + m2.SubMatches(2) - 1
                    Else
                        RetVal = RetVal + m.SubMatches(1) * (m2.SubMatches(2) - 1)
                    End If
                End If
            Next
        Next m2
    End If

End Sub

这当然可以通过测试 CDP1802 建议的一次 RegExp 执行中的所有元素来进一步改进,但我将把它留给你!

【讨论】:

  • 谢谢,这完全可以正常工作
【解决方案2】:

您可能会通过在一次正则表达式执行中提取所有元素而不是一次只提取一个元素来提高性能。

Option Explicit

Sub Demo()

    Dim lastrow As Long, lastcol As Long
    Dim c As Long, r As Long, d As Object
    Dim f As String, el As String, ar
    Set d = CreateObject("Scripting.Dictionary")
    
    With Sheet1
        lastrow = .Cells(.Rows.Count, "A").End(xlUp).Row
        lastcol = .Cells(1, .Columns.Count).End(xlToLeft).Column
        ar = .Cells(1, 1).Resize(lastrow, lastcol)
        
        For r = 2 To lastrow
            f = ar(r, 1)
            Call parse(d, f)
            For c = 2 To lastcol
               el = ar(1, c)
               If d.exists(el) Then
                   ar(r, c) = d(el)
               End If
            Next
            d.RemoveAll
        Next
        .Cells(1, 1).Resize(lastrow, lastcol) = ar
    End With
    
    MsgBox "Done"
    
End Sub

Sub parse(ByRef dict, s As String)
  
    Dim regex As Object
    Set regex = CreateObject("VBScript.RegExp")
    With regex
        .Global = False
        .Pattern = "[(]([^)]+)[)](\d+)"
    End With
    
    ' expand bracket into multiple entries
    Dim m, matches, sm, n As Long, el As String
    Do While regex.test(s)
        Set m = regex.Execute(s)
        For n = 1 To m(0).submatches(1)
            s = s & " " & m(0).submatches(0)
        Next
        s = regex.Replace(s, "")
    Loop
    
    ' count elements
    regex.Pattern = "(" & Symbols & ")([0-9]*)"
    regex.Global = True
    If regex.test(s) Then
        Set matches = regex.Execute(s)
        For Each m In matches
            el = m.submatches(0)
            n = Val(m.submatches(1))
            If n = 0 Then n = 1
            dict(el) = dict(el) + n
        Next
    End If
    
End Sub

Function Symbols() As String

   Symbols = "A[cglmrstu]|" & _
        "B[aehikr]?|" & _
        "C[adeflmnorsu]?|" & _
        "D[bsy]|" & _
        "E[rsu]|" & _
        "F[elmr]?|" & _
        "G[ade]|" & _
        "H[efgos]?|" & _
        "I[nr]?|" & _
        "K[r]?|" & _
        "L[airuv]|" & _
        "M[cdgnot]|" & _
        "N[abdehiop]?|" & _
        "O[gs]?|" & _
        "P[abdmortu]?|" & _
        "R[abefghnu]|" & _
        "S[bcegimnr]?|" & _
        "T[abcehilms]|" & _
        "[UVW]|" & _
        "X[e]|" & _
        "Y[b]?|" & _
        "Z[nr]"

End Function

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2014-09-05
    • 2022-01-18
    • 2019-07-29
    • 2015-12-14
    • 2021-11-20
    • 1970-01-01
    相关资源
    最近更新 更多