【问题标题】:How to ignore punctuation in-between words using word_tokenize in NLTK?(如何在 NLTK 中使用 word_tokenize 忽略单词之间的标点符号?)
【发布时间】:2020-05-12 16:53:40
【问题描述】:

我希望使用 NLTK word_tokenize 忽略单词之间的字符。

如果我有一句话:

test = 'Should I trade on the S&P? This works with a phone number 333-445-6635 and email test@testing.com'

word_tokenize 方法将 S&P 拆分为

'S','&','P','?'

有没有办法让这个库忽略单词或字母之间的标点符号? 预期输出:'S&P','?'

【问题讨论】:

    标签: python nlp nltk tokenize


    【解决方案1】:

    请告诉我下面的代码对你的句子效果如何。
    我添加了一个带有一堆标点符号的附加测试。
    正则表达式的最后一部分是从 WordPunctTokenizer 正则表达式修改而来的。

    from nltk.tokenize import RegexpTokenizer
    
    # Character class of punctuation marks that may join two runs of word
    # characters; the trailing "?" makes the joiner optional.
    punctuation = r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]?'
    # First alternative: word characters, at most one punctuation joiner, then
    # more word characters, so "S&P" stays a single token.  The run after the
    # joiner is greedy; a lazy `\w+?` here would stop after one character and
    # split e.g. "well-known" into "well-k" / "nown".
    # Second alternative: any single remaining non-whitespace character.
    tokenizer = RegexpTokenizer(r'\w+' + punctuation + r'\w+|[^\s]+?')
    
    # result: 
    In [156]: tokenizer.tokenize(test)
    Out[156]: ['Should', 'I', 'trade', 'on', 'the', 'S&P', '?']
    
    # additional test:
    In [225]: tokenizer.tokenize('"I am tired," she said.')
    Out[225]: ['"', 'I', 'am', 'tired', ',', '"', 'she', 'said', '.']
    

    编辑:需求有所变化,因此我们可以针对新的需求对 PottsTweetTokenizer 稍作修改:

    # The patterns below are compiled with the standard `re` module; import it
    # here so the snippet runs on its own.
    import re

    # Emoticon pattern (verbose syntax): eyes, optional nose and mouth in either
    # reading direction, with an optional "<" or ">" on the outer side.
    emoticon_string = r"""
        (?:
          [<>]?
          [:;=8]                     # eyes
          [\-o\*\']?                 # optional nose
          [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth      
          |
          [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
          [\-o\*\']?                 # optional nose
          [:;=8]                     # eyes
          [<>]?
        )"""
    # Twitter symbols/cashtags:  # Added by awd, 20140410.
    # Based upon Twitter's regex described here: <https://blog.twitter.com/2013/symbols-entities-tweets>.
    cashtag_string = r"""(?:\$[a-zA-Z]{1,6}([._][a-zA-Z]{1,2})?)"""
    
    # The components of the tokenizer.  They are joined with "|" below, so at any
    # position an earlier entry wins over a later one: the specific patterns
    # (phone numbers, e-mail addresses, ...) must stay ahead of the catch-alls.
    regex_strings = (
        # Phone numbers:
        r"""
        (?:
          (?:            # (international)
            \+?[01]
            [\-\s.]*
          )?            
          (?:            # (area code)
            [\(]?
            \d{3}
            [\-\s.\)]*
          )?    
          \d{3}          # exchange
          [\-\s.]*   
          \d{4}          # base
        )"""
        ,
        # Emoticons:
        emoticon_string
        ,
        # HTML tags:
        r"""(?:<[^>]+>)"""
        ,
        # URLs (only t.co short links); the dot is escaped so that it matches a
        # literal "." rather than any character:
        r"""(?:http[s]?://t\.co/[a-zA-Z0-9]+)"""
        ,
        # Twitter username:
        r"""(?:@[\w_]+)"""
        ,
        # Twitter hashtags:
        r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
        ,
        # Twitter symbols/cashtags:
        cashtag_string
        ,
        # email addresses
        r"""(?:[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-])""",
        # Remaining word types:
        r"""
        (?:[a-z][^\s]+[a-z])           # Words with punctuation (modification here).
        |
        (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
        |
        (?:[\w_]+)                     # Words without apostrophes or dashes.
        |
        (?:\.(?:\s*\.){1,})            # Ellipsis dots. 
        |
        (?:\S)                         # Everything else that isn't whitespace.
        """
        )
    # One alternation over every component.  VERBOSE lets the patterns above
    # carry layout whitespace and "#" comments; I makes matching case-insensitive.
    word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
    # The emoticon and cashtag strings get their own regex so that we can preserve case for them as needed:
    emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)
    cashtag_re = re.compile(cashtag_string, re.VERBOSE | re.I | re.UNICODE)
    
    # These are for regularizing HTML entities to Unicode:
    html_entity_digit_re = re.compile(r"&#\d+;")   # numeric entities such as "&#38;"
    html_entity_alpha_re = re.compile(r"&\w+;")    # named entities such as "&lt;"
    amp = "&amp;"                                  # the ampersand entity is handled separately
    
    class CustomTweetTokenizer:
        """Regex tokenizer built on the module-level ``word_re`` pattern.

        Args:
            preserve_case: keyword-only; when true, tokens are returned exactly
                as matched, otherwise each token is case-normalized.
        """

        def __init__(self, *, preserve_case: bool = False):
            self.preserve_case = preserve_case

        def tokenize(self, tweet: str) -> list:
            """Split ``tweet`` into a list of token strings.

            HTML entities are decoded first, then every ``word_re`` match becomes
            one token.  Whitespace between matches is not returned, so joining
            the tokens does not reproduce the input.  Unless ``preserve_case``
            is set, each token goes through ``_normalize_token``.
            """
            tweet = self._html2unicode(tweet)
            tokens = (match.group() for match in word_re.finditer(tweet))
            if self.preserve_case:
                return list(tokens)
            return [self._normalize_token(token) for token in tokens]

        @staticmethod
        def _normalize_token(token: str) -> str:
            """Return the case-normalized form of a single token.

            Emoticons are left untouched (so ":D" does not become ":d"),
            cashtags are upper-cased, everything else is lower-cased.
            """
            # NOTE: `search` matches anywhere, so a token that merely contains
            # an emoticon-like substring is also returned unchanged.
            if emoticon_re.search(token):
                return token
            if token.startswith('$') and cashtag_re.search(token):
                return token.upper()
            return token.lower()

        @staticmethod
        def _html2unicode(tweet: str) -> str:
            """Replace HTML entities in ``tweet`` with their characters.

            Numeric entities ("&#38;") and known named entities ("&lt;") become
            the corresponding character; unknown or out-of-range entities are
            left as they are.  "&amp;" is always rewritten to " and ".
            """
            # Local import keeps the method usable even if the surrounding
            # script never imported `html.entities`.
            from html.entities import name2codepoint

            # First the digits:
            for ent in set(html_entity_digit_re.findall(tweet)):
                try:
                    char = chr(int(ent[2:-1]))
                except (ValueError, OverflowError):
                    # Code point outside the valid range: keep the entity text.
                    continue
                tweet = tweet.replace(ent, char)

            # Now the alpha versions ("&amp;" is handled separately below):
            for ent in set(html_entity_alpha_re.findall(tweet)):
                if ent == amp:
                    continue
                codepoint = name2codepoint.get(ent[1:-1])
                if codepoint is not None:
                    tweet = tweet.replace(ent, chr(codepoint))

            # Done unconditionally: previously this ran only inside the loop
            # above, so a text whose sole entity was "&amp;" kept it verbatim.
            return tweet.replace(amp, " and ")
    

    测试一下:

    # preserve_case=True returns every token exactly as it was matched.
    tknzr = CustomTweetTokenizer(preserve_case=True)
    # `test` is the sample sentence defined in the question.
    tknzr.tokenize(test)
    
    # result:
    ['Should',
     'I',
     'trade',
     'on',
     'the',
     'S&P',
     '?',
     'This',
     'works',
     'with',
     'a',
     'phone',
     'number',
     '333-445-6635',
     'and',
     'email',
     'test@testing.com']
    

    【讨论】:

    • 很酷!有没有办法查看 nltk 使用的默认正则表达式?
    • 当然,输入help(RegexpTokenizer)。我很高兴答案有所帮助!
    • 再次感谢!保持安全
    • 我意识到我还需要在该表达式中捕获电话号码和电子邮件地址。如何对其进行修改呢?我编辑了我的原始测试字符串
    • @dataviews:让我看看我能做什么。感谢您告诉我有关 TweetTokenizer 的信息。
    【解决方案2】:

    跟进@mechanical_meat 的回答,

    NLTK 中有一个 twitter 文本标记器

    它很可能源自 PottsTweetTokenizer,源码见:https://github.com/nltk/nltk/blob/develop/nltk/tokenize/casual.py

    from nltk.tokenize import TweetTokenizer
    
    # Sample sentence from the question.
    text = 'Should I trade on the S&P? This works with a phone number 333-445-6635 and email test@testing.com'
    # Tokenize it with NLTK's TweetTokenizer and show the tokens.
    tt = TweetTokenizer()
    tokens = tt.tokenize(text)
    print(tokens)
    

    [出]:

    ['Should', 'I', 'trade', 'on', 'the', 'S', '&', 'P', '?', 'This', 'works', 'with', 'a', 'phone', 'number', '333-445-6635', 'and', 'email', 'test@testing.com']
    

    但这并没有解决 S&P 被拆开的问题!

    所以你可以试试多词表达式(MWE)的方法,参见 https://stackoverflow.com/a/55644296/610569

    from nltk import word_tokenize
    from nltk.tokenize import TweetTokenizer
    from nltk.tokenize import MWETokenizer
    
    def multiword_tokenize(text, mwe, tokenize_func=word_tokenize):
        """Tokenize ``text`` while keeping each expression in ``mwe`` whole.

        Args:
            text: the string to tokenize.
            mwe: sequence of protected expressions (e.g. ``['S&P']``).
            tokenize_func: callable turning a string into a list of tokens; it
                is applied both to ``text`` and to every protected expression.

        Returns:
            The token list, with every merged expression restored to the exact
            spelling given in ``mwe``.
        """
        # Tokenize each protected expression the same way the text will be
        # tokenized, so MWETokenizer can recognise the pieces.
        protected_tuples = [tokenize_func(word) for word in mwe]
        tokenizer = MWETokenizer(protected_tuples)

        # MWETokenizer glues the pieces of a match together with "_".  Map each
        # glued form back to the original expression; `setdefault` keeps the
        # first expression when two of them glue to the same string.
        restore = {}
        for original, pieces in zip(mwe, protected_tuples):
            restore.setdefault('_'.join(pieces), original)

        merged_tokens = tokenizer.tokenize(tokenize_func(text))
        # One dictionary lookup per token instead of scanning a list twice.
        return [restore.get(token, token) for token in merged_tokens]
    
    
    # Tweet-aware tokenizer that does the base tokenization.
    tt = TweetTokenizer()
    # Expressions that must survive tokenization as a single token.
    mwe = ['S&P']
    # Sample sentence from the question.
    text = 'Should I trade on the S&P? This works with a phone number 333-445-6635 and email test@testing.com'

    print(multiword_tokenize(text, mwe, tokenize_func=tt.tokenize))
    

    [出]:

    ['Should', 'I', 'trade', 'on', 'the', 'S&P', '?', 'This', 'works', 'with', 'a', 'phone', 'number', '333-445-6635', 'and', 'email', 'test@testing.com']
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2011-03-06
      • 2016-09-03
      • 2013-03-10
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多