Skip to content

0.1 - Iberian Brill

Latest
Compare
Choose a tag to compare
@JarbasAl JarbasAl released this 31 May 20:14
· 5 commits to master since this release

NLTK

Postag

NLTK does not ship with pre-trained POS taggers for most languages.

Portuguese

Brill tagger trained on the floresta and mac_morpho corpora.

import pickle
from nltk import word_tokenize

# Load the pre-trained Portuguese Brill tagger from its pickle file.
# SECURITY: unpickling can execute arbitrary code, so only load a .pkl file
# obtained from a source you trust.
with open("brill_tagger_floresta_mcmorpho_pt.pkl", "rb") as f:
    tagger = pickle.load(f)
    
# Split the sentence into word and punctuation tokens.
# NOTE(review): no `language` argument is passed, so NLTK's default tokenizer
# language is used - confirm whether language="portuguese" would be preferable.
tokens = word_tokenize("Olá, o meu nome é Joaquim")
# Tag every token; the result is a list of (token, tag) pairs.
postagged = tagger.tag(tokens)
# Sample output:
# [('Olá', 'NOUN'), (',', '.'), ('o', 'DET'), ('meu', 'PRON'), ('nome', 'NOUN'), ('é', 'VERB'), ('Joaquim', 'NOUN')]

Spanish

Brill tagger trained on the cess_esp corpus.

import pickle
from nltk import word_tokenize

# Load the pre-trained Spanish Brill tagger from its pickle file.
# SECURITY: unpickling can execute arbitrary code, so only load a .pkl file
# obtained from a source you trust.
with open("brill_tagger_cess_es.pkl", "rb") as f:
    tagger = pickle.load(f)
    
# Split the sentence into word and punctuation tokens.
# NOTE(review): no `language` argument is passed, so NLTK's default tokenizer
# language is used - confirm whether language="spanish" would be preferable.
tokens = word_tokenize("Hola, mi nombre es Daniel")
# Tag every token; the result is a list of (token, tag) pairs.
postagged = tagger.tag(tokens)
# Sample output (note the comma is tagged 'fc' here rather than '.';
# presumably a tag carried over from the training corpus - TODO confirm):
# [('Hola', 'NOUN'), (',', 'fc'), ('mi', 'DET'), ('nombre', 'NOUN'), ('es', 'VERB'), ('Daniel', 'NOUN')]

Catalan

Brill tagger trained on the cess_cat corpus.

import pickle
from nltk import word_tokenize

# Load the pre-trained Catalan Brill tagger from its pickle file.
# SECURITY: unpickling can execute arbitrary code, so only load a .pkl file
# obtained from a source you trust.
with open("brill_tagger_cess_ca.pkl", "rb") as f:
    tagger = pickle.load(f)
    
# Split the sentence into word and punctuation tokens.
# NOTE(review): the example sentence below looks like Spanish rather than
# Catalan - confirm; the sample output was recorded for this exact input.
tokens = word_tokenize("Quién es el presidente de Cataluña?")
# Tag every token; the result is a list of (token, tag) pairs.
postagged = tagger.tag(tokens)
# Sample output (the question mark is tagged 'fit'; presumably a tag carried
# over from the training corpus - TODO confirm):
# [('Quién', 'NOUN'), ('es', 'PRON'), ('el', 'DET'), ('presidente', 'NOUN'), ('de', 'ADP'), ('Cataluña', 'NOUN'), ('?', 'fit')]