【问题标题】:Simplifying the French POS Tag Set with NLTK(使用 NLTK 简化法语 POS 标签集)
【发布时间】:2014-12-16 20:19:49
【问题描述】:

如何简化斯坦福法语词性标注器返回的词性标签?对英文句子来说这相当容易:把句子读入 NLTK,找出每个单词的词性,然后使用 map_tag() 来简化标签集:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
from nltk.tag.stanford import POSTagger
from nltk.tokenize import word_tokenize
from nltk.tag import map_tag

# Set the JAVA_HOME environment variable from within the script.
# Run os.getenv("JAVA_HOME") to check the value that is in effect.
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin"

english = u"the whole earth swarms with living beings, every plant, every grain and leaf, supports the life of thousands."

path_to_english_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger"
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar"

# Define the English tagger from the model file and the tagger jar.
english_tagger = POSTagger(path_to_english_model, path_to_jar, encoding="utf-8")

# Each tuple in list_of_english_pos_tuples is (word, pos).
list_of_english_pos_tuples = english_tagger.tag(word_tokenize(english))

# Map every tag from the 'en-ptb' tagset to the 'universal' tagset.
simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in list_of_english_pos_tuples]

# Parenthesised so the line is valid both as a Python 2 print statement
# and as a Python 3 print() call; the printed text is the same.
print(simplified_pos_tags_english)

#output = [(u'the', u'DET'), (u'whole', u'ADJ'), (u'earth', u'NOUN'), (u'swarms', u'NOUN'), (u'with', u'ADP'), (u'living', u'NOUN'), (u'beings', u'NOUN'), (u',', u'.'), (u'every', u'DET'), (u'plant', u'NOUN'), (u',', u'.'), (u'every', u'DET'), (u'grain', u'NOUN'), (u'and', u'CONJ'), (u'leaf', u'NOUN'), (u',', u'.'), (u'supports', u'VERB'), (u'the', u'DET'), (u'life', u'NOUN'), (u'of', u'ADP'), (u'thousands', u'NOUN'), (u'.', u'.')]

但我不确定如何将以下代码返回的法语标签映射到通用标签集:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
from nltk.tag.stanford import POSTagger
from nltk.tokenize import word_tokenize
from nltk.tag import map_tag

# Set the JAVA_HOME environment variable from within the script.
# Run os.getenv("JAVA_HOME") to check the value that is in effect.
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin"

french = u"Chaque plante, chaque graine, chaque particule de matière organique contient des milliers d'atomes animés."

path_to_french_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\french.tagger"
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar"

# Define the French tagger from the model file and the tagger jar.
french_tagger = POSTagger(path_to_french_model, path_to_jar, encoding="utf-8")

# Each tuple in list_of_french_pos_tuples is (word, pos).
list_of_french_pos_tuples = french_tagger.tag(word_tokenize(french))

# Up to this point all is well. The open question is which source tagset name
# to pass to map_tag for the French tags: 'SOME_ARGUMENT' is a placeholder,
# not a real tagset name.
simplified_pos_tags_french = [(word, map_tag('SOME_ARGUMENT', 'universal', tag)) for word, tag in list_of_french_pos_tuples]

# Parenthesised so the line is valid both as a Python 2 print statement
# and as a Python 3 print() call; the printed text is the same.
print(simplified_pos_tags_french)

有谁知道如何简化斯坦福 POS 标记器中法国模型使用的默认标记集?如果其他人可以就这个问题提供任何见解,我将不胜感激。

【问题讨论】:

    标签: python syntax nlp nltk stanford-nlp


    【解决方案1】:

    我最终只是手动将斯坦福的 POS 标签映射到通用标签集。值得一提的是,上面的代码片段是一个稍大的工作流程的一部分,该工作流程旨在测量法语和英语句子之间的句法相似性。这是完整的代码,以防对其他人有所帮助:

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    
    '''NLTK 3.0 offers map_tag, which maps the Penn Treebank Tag Set to the Universal Tagset, a coarse tag set with the following 12 tags:
    
    VERB - verbs (all tenses and modes)
    NOUN - nouns (common and proper)
    PRON - pronouns
    ADJ - adjectives
    ADV - adverbs
    ADP - adpositions (prepositions and postpositions)
    CONJ - conjunctions
    DET - determiners
    NUM - cardinal numbers
    PRT - particles or other function words
    X - other: foreign words, typos, abbreviations
    . - punctuation
    
    We'll map Stanford's tag set to this tag set then compare the similarity between subregions of French and English sentences.'''
    
    from __future__ import division
    import os, math
    from nltk.tag.stanford import POSTagger
    from nltk.tokenize import word_tokenize
    from nltk.tag import map_tag
    from collections import Counter
    
    #########################
    # Create Tagset Mapping #
    #########################
    
    def create_french_to_universal_dict():
        '''Build and return a fresh dict that maps each French POS tag listed below to a universal-tagset tag.'''
        return {
            # adjectives and adverbs (plain and WH forms)
            u"ADJ":    u"ADJ",
            u"ADJWH":  u"ADJ",
            u"ADV":    u"ADV",
            u"ADVWH":  u"ADV",
            u"CC":     u"CONJ",
            u"CLO":    u"PRON",
            u"CLR":    u"PRON",
            u"CLS":    u"PRON",
            u"CS":     u"CONJ",
            u"DET":    u"DET",
            u"DETWH":  u"DET",
            u"ET":     u"X",
            u"NC":     u"NOUN",
            u"NPP":    u"NOUN",
            u"P":      u"ADP",
            u"PUNC":   u".",
            u"PRO":    u"PRON",
            u"PROREL": u"PRON",
            u"PROWH":  u"PRON",
            # every verb form collapses to VERB
            u"V":      u"VERB",
            u"VIMP":   u"VERB",
            u"VINF":   u"VERB",
            u"VPP":    u"VERB",
            u"VPR":    u"VERB",
            u"VS":     u"VERB",
            # nb: the universal tagset has no interjection tag, so I is mapped to X
            u"I":      u"X",
        }

    # Lookup table built once at import time; map_french_tag_to_universal reads it.
    french_to_universal_dict = create_french_to_universal_dict()
    
    def map_french_tag_to_universal(list_of_french_tag_tuples):
        '''Take a list of (word, pos) tuples and return a new list of (word, universal_pos) tuples.

        Each pos is looked up in the module-level french_to_universal_dict;
        a pos that is not a key of that dict raises KeyError.
        '''
        universal_tuples = []
        for entry in list_of_french_tag_tuples:
            token, french_tag = entry[0], entry[1]
            universal_tuples.append((token, french_to_universal_dict[french_tag]))
        return universal_tuples
    
    ###############################
    # Define Similarity Functions #
    ###############################
    
    def counter_cosine_similarity(c1, c2):
        '''Return the cosine similarity of two counters.

        c1, c2 -- Counters (any mapping with .get() works) from term to numeric count;
                  a term missing from one of them counts as 0.

        The result is dotprod / (|c1| * |c2|). When that denominator is zero
        (for example because one counter is empty) the cosine is undefined, so
        0.0 is returned instead of raising ZeroDivisionError.
        '''
        terms = set(c1).union(c2)
        dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
        magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
        magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
        denominator = magA * magB
        # An empty or all-zero counter has no direction: report "no similarity".
        if denominator == 0:
            return 0.0
        return dotprod / denominator
    
    def longest_common_subsequence_length(a, b):
        '''Return the length of the longest common subsequence of two lists.

        a, b -- sequences whose items are compared with ==; both need len().

        Uses a (len(a) + 1) x (len(b) + 1) table in which table[i][j] is the
        answer for the first i items of a and the first j items of b.
        Returns 0 when either sequence is empty.
        '''
        # range (not xrange) so the function runs on both Python 2 and Python 3.
        table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
        for i, ca in enumerate(a, 1):
            for j, cb in enumerate(b, 1):
                table[i][j] = (
                    # matching items extend the subsequence ending just before both
                    table[i - 1][j - 1] + 1 if ca == cb else
                    # otherwise carry over the better result of dropping one item
                    max(table[i][j - 1], table[i - 1][j]))
        return table[-1][-1]
    
    def longest_contiguous_subsequence_length(a, b):
        '''Return the length of the longest contiguous run of items shared by two lists.

        a, b -- sequences whose items are compared with ==; both need len().

        table[i][j] holds the length of the common run that ends at a[i - 1]
        and b[j - 1] (0 when those two items differ). The largest value seen
        is returned; it is 0 when either sequence is empty.
        '''
        # range (not xrange) so the function runs on both Python 2 and Python 3.
        table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
        l = 0
        for i, ca in enumerate(a, 1):
            for j, cb in enumerate(b, 1):
                if ca == cb:
                    # extend the run that ended on the previous pair of items
                    table[i][j] = table[i - 1][j - 1] + 1
                    if table[i][j] > l:
                        l = table[i][j]
        return l
    
    def calculate_syntactic_similarity(french_pos_tuples, english_pos_tuples):
        '''Compare the POS sequences of a French and an English sentence.

        french_pos_tuples, english_pos_tuples -- lists of (word, pos) tuples.

        Returns a 3-tuple:
          cosine_similarity         -- cosine similarity of the two tag-frequency Counters
          lc_subsequence            -- longest common subsequence length of the two tag
                                       sequences, divided by the longer sequence's length
          lc_contiguous_subsequence -- longest common contiguous run of tags, divided by
                                       the longer sequence's length

        If either input has no tags, none of the measures is defined and
        (0.0, 0.0, 0.0) is returned.
        '''
        french_pos_list           = [tup[1] for tup in french_pos_tuples]
        english_pos_list          = [tup[1] for tup in english_pos_tuples]
        # Guard against dividing by zero below (and inside the cosine computation).
        if not french_pos_list or not english_pos_list:
            return 0.0, 0.0, 0.0
        french_pos_counter        = Counter(french_pos_list)
        english_pos_counter       = Counter(english_pos_list)
        cosine_similarity         = counter_cosine_similarity(french_pos_counter, english_pos_counter)
        # float() keeps the ratios fractional even without "from __future__ import division".
        longest_length            = float(max(len(french_pos_list), len(english_pos_list)))
        # The subsequence measures need the ordered tag lists. Iterating a Counter
        # yields only its distinct keys, so passing the Counters would discard
        # both the order and the repetitions of the tags.
        lc_subsequence            = longest_common_subsequence_length(french_pos_list, english_pos_list) / longest_length
        lc_contiguous_subsequence = longest_contiguous_subsequence_length(french_pos_list, english_pos_list) / longest_length
        return cosine_similarity, lc_subsequence, lc_contiguous_subsequence
    
    ########################### 
    # Parse POS with Stanford #
    ###########################
    
    # Set the JAVA_HOME environment variable from within the script.
    # Run os.getenv("JAVA_HOME") to check the value that is in effect.
    os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin"

    english = u"the whole earth swarms with living beings, every plant, every grain and leaf, supports the life of thousands."
    french = u"Chaque plante, chaque graine, chaque particule de matière organique contient des milliers d'atomes animés."

    # Paths to the two tagger models and to the tagger jar.
    path_to_english_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger"
    path_to_french_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\french.tagger"
    path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar"

    # Define the English and French taggers.
    english_tagger = POSTagger(path_to_english_model, path_to_jar, encoding="utf-8")
    french_tagger = POSTagger(path_to_french_model, path_to_jar, encoding="utf-8")

    # Each tuple in these lists is (word, pos).
    list_of_english_pos_tuples = english_tagger.tag(word_tokenize(english))
    list_of_french_pos_tuples = french_tagger.tag(word_tokenize(french))

    # Simplify each tagset: English goes through NLTK's map_tag, French through
    # the hand-written mapping defined earlier in this file.
    simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in list_of_english_pos_tuples]
    simplified_pos_tags_french = map_french_tag_to_universal( list_of_french_pos_tuples )

    # Parenthesised so the line is valid both as a Python 2 print statement
    # and as a Python 3 print() call; the printed text is the same.
    print(calculate_syntactic_similarity(simplified_pos_tags_french, simplified_pos_tags_english))
    

    【讨论】:

    • 感谢您这样做! NLTK 人员可能对您从斯坦福标签集(“Crabbe and Candito”)到通用标签集的映射感兴趣。
    • 我的荣幸!我会尝试在某个时候创建一个拉取请求,以便他们可以在未来的版本中包含此映射。
    • @duhaime,谢谢你!我已经采用了您的映射并创建了一个拉取请求,把它贡献给通用 POS 标签项目 (github.com/slavpetrov/universal-pos-tags/pull/12),这要归功于您和这个 SO 页面。
    猜你喜欢
    • 2012-11-26
    • 1970-01-01
    • 2013-12-18
    • 1970-01-01
    • 2011-08-12
    • 2014-09-22
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多