【问题标题】:Pandas: Update column on condition with regex熊猫:使用正则表达式更新列
【发布时间】:2019-10-19 02:50:50
【问题描述】:

所以我有一个列,它是 Pandas Dataframe 中的对象类型的列。

它包括以下数据:

array(['9.4', '9.8', '10', '9.5', '10.5', '9.2', '9.9', '9.1', '9.3', '9',
   '9.7', '10.1', '10.6', '9.6', '10.8', '10.3', '13.1', '10.2',
   '10.9', '10.7', '12.9', '10.4', '13', '14', '11.5', '11.4', '12.4',
   '11', '12.2', '12.8', '12.6', '12.5', '11.7', '11.3', '12.3', '12',
   '11.9', '11.8', '8.7', '13.3', '11.2', '11.6', '11.1', '13.4',
   '12.1', '8.4', '12.7', '14.9', '13.2', '13.6', '13.5',
   '100.333.333.333.333', '9.55', '8.5', '110.666.666.666.667',
   '956.666.666.666.667', '10.55', '8.8', '135.666.666.666.667',
   '11.95', '9.95', '923.333.333.333.333', '9.25', '9.05', '10.75',
   '8.6', '8.9', '13.9', '13.7', '8', '8.0', '14.2', '11.94',
   '128.933.333.333.333', '114.666.666.666.667', '10.98',
   '114.333.333.333.333', '105.333.333.333.333',
   '953.333.333.333.333', '109.333.333.333.333',
   '113.666.666.666.667', '113.333.333.333.333',
   '973.333.333.333.333', '11.05', '9.75', '11.35', '11.45', '14.05',
   '123.333.333.333.333', '12.75', '13.8', '12.15', '13.05',
   '112.666.666.666.667', '105.666.666.666.667',
   '117.333.333.333.333', '11.75', '10.65', '109.666.666.666.667',
   '101.333.333.333.333', '10.15', '104.666.666.666.667',
   '116.333.333.333.333', '12.25', '11.85', '11.65', '13.55',
   '131.333.333.333.333', '120.666.666.666.667', '11.55',
   '963.333.333.333.333', '12.05'], dtype=object)

我想用 0 之类的东西更新那些有多个点 ('.') 的点。我对 regex-es 不是很熟悉,但我的想法是使用 regex 来解决这个问题,而不是 '953.333.333.333.333'!

DF.replace({'column': '953.333.333.333.333'},'0')

非常感谢!

【问题讨论】:

    标签: python regex pandas dataframe conditional-statements


    【解决方案1】:

    numpy.whereSeries.str.countSeries.gt 一起使用:

    DF['column'] = np.where(DF['column'].str.count('\.').gt(1), 0, DF['column'])
    

    [出]

    array(['9.4', '9.8', '10', '9.5', '10.5', '9.2', '9.9', '9.1', '9.3', '9',
           '9.7', '10.1', '10.6', '9.6', '10.8', '10.3', '13.1', '10.2',
           '10.9', '10.7', '12.9', '10.4', '13', '14', '11.5', '11.4', '12.4',
           '11', '12.2', '12.8', '12.6', '12.5', '11.7', '11.3', '12.3', '12',
           '11.9', '11.8', '8.7', '13.3', '11.2', '11.6', '11.1', '13.4',
           '12.1', '8.4', '12.7', '14.9', '13.2', '13.6', '13.5', 0, '9.55',
           '8.5', 0, 0, '10.55', '8.8', 0, '11.95', '9.95', 0, '9.25', '9.05',
           '10.75', '8.6', '8.9', '13.9', '13.7', '8', '8.0', '14.2', '11.94',
           0, 0, '10.98', 0, 0, 0, 0, 0, 0, 0, '11.05', '9.75', '11.35',
           '11.45', '14.05', 0, '12.75', '13.8', '12.15', '13.05', 0, 0, 0,
           '11.75', '10.65', 0, 0, '10.15', 0, 0, '12.25', '11.85', '11.65',
           '13.55', 0, 0, '11.55', 0, '12.05'], dtype=object)
    

    【讨论】:

    • @Eva 很高兴它有帮助:)
    • @Eva 相同的想法,但使用 Pandas 方法。 df.assign(column=df.column.mask(df.column.str.count('\.').gt(1), 0))
    【解决方案2】:

    这个表达式可以简单地捕获所需元素中的.

    '\d+\.\d+'|'\d+'|(\.)
    

    使用这个捕获组:

    (\.)
    

    Demo

    测试

    # coding=utf8
    # the above tag defines encoding for this document and is for Python 2.x compatibility
    
    import re
    
    regex = r"'\d+\.\d+'|'\d+'|(\.)"
    
    test_str = ("'9.4', '9.8', '10', '9.5', '10.5', '9.2', '9.9', '9.1', '9.3', '9',\n"
        "   '9.7', '10.1', '10.6', '9.6', '10.8', '10.3', '13.1', '10.2',\n"
        "   '10.9', '10.7', '12.9', '10.4', '13', '14', '11.5', '11.4', '12.4',\n"
        "   '11', '12.2', '12.8', '12.6', '12.5', '11.7', '11.3', '12.3', '12',\n"
        "   '11.9', '11.8', '8.7', '13.3', '11.2', '11.6', '11.1', '13.4',\n"
        "   '12.1', '8.4', '12.7', '14.9', '13.2', '13.6', '13.5',\n"
        "   '100.333.333.333.333', '9.55', '8.5', '110.666.666.666.667',\n"
        "   '956.666.666.666.667', '10.55', '8.8', '135.666.666.666.667',\n"
        "   '11.95', '9.95', '923.333.333.333.333', '9.25', '9.05', '10.75',\n"
        "   '8.6', '8.9', '13.9', '13.7', '8', '8.0', '14.2', '11.94',\n"
        "   '128.933.333.333.333', '114.666.666.666.667', '10.98',\n"
        "   '114.333.333.333.333', '105.333.333.333.333',\n"
        "   '953.333.333.333.333', '109.333.333.333.333',\n"
        "   '113.666.666.666.667', '113.333.333.333.333',\n"
        "   '973.333.333.333.333', '11.05', '9.75', '11.35', '11.45', '14.05',\n"
        "   '123.333.333.333.333', '12.75', '13.8', '12.15', '13.05',\n"
        "   '112.666.666.666.667', '105.666.666.666.667',\n"
        "   '117.333.333.333.333', '11.75', '10.65', '109.666.666.666.667',\n"
        "   '101.333.333.333.333', '10.15', '104.666.666.666.667',\n"
        "   '116.333.333.333.333', '12.25', '11.85', '11.65', '13.55',\n"
        "   '131.333.333.333.333', '120.666.666.666.667', '11.55',\n"
        "   '963.333.333.333.333', '12.05'")
    
    matches = re.finditer(regex, test_str, re.MULTILINE)
    
    for matchNum, match in enumerate(matches, start=1):
    
        print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    
        for groupNum in range(0, len(match.groups())):
            groupNum = groupNum + 1
    
            print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
    
    # Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
    

    【讨论】:

      【解决方案3】:

      最好写一个函数,然后使用df.column.apply(function)

      函数如下:

      def fun(val):
          if (len(val.split('.')) > 2) :
              return '0'
          else:
              return val
      

      【讨论】:

        猜你喜欢
        • 2018-04-30
        • 1970-01-01
        • 2017-06-18
        • 2016-12-31
        • 2015-11-18
        • 2020-11-14
        • 2020-10-23
        • 2018-06-23
        相关资源
        最近更新 更多