June 25, 2019

Python nltk package (NLP)

nltk Package & NLP in Python

import nltk  # Natural Language Toolkit
nltk.download()  # opens the interactive downloader for corpora and models

  corpus -- a body of text (plural: corpora)
  lexicon -- words and their meanings

nltk.download("popular")
nltk.download('book')
nltk.download('stopwords')

>>> from nltk.book import *
from nltk.tokenize import sent_tokenize, word_tokenize
print(sent_tokenize(EXAMPLE_TEXT))
sents = nltk.sent_tokenize(paragraph)
>>> tokens = nltk.word_tokenize(sentence)
tokenized_sent = word_tokenize(sentences[3])
my_lines = [word_tokenize(l) for l in lines]
>>> tagged = nltk.pos_tag(tokens)
pos_tags = nltk.pos_tag(nltk.word_tokenize(sent))
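A minimal runnable version of the tokenize-then-tag flow above (the sample paragraph is made up; requires the 'punkt' and 'averaged_perceptron_tagger' data):

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

paragraph = "NLTK is a leading platform for building NLP programs. It ships with many corpora and models."
sents = sent_tokenize(paragraph)        # split the paragraph into sentences
tokens = word_tokenize(sents[0])        # split the first sentence into word tokens
tagged = nltk.pos_tag(tokens)           # list of (token, POS-tag) pairs
print(tagged)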
regexp_tokenize(r"(#\d\w+\?!)", sent)
regexp_tokenize(r"#\w+", tweets)
regexp_tokenize(tweets[-1], r"([@#]\w+)")
print(regexp_tokenize(german_text, "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F']"))
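regexp_tokenize takes the text first and the pattern second; a small sketch on a made-up tweet:

from nltk.tokenize import regexp_tokenize

tweet = "This is the best #nlp exercise ever! #python @datacamp"
print(regexp_tokenize(tweet, r"#\w+"))      # hashtags only
print(regexp_tokenize(tweet, r"[@#]\w+"))   # hashtags and @-mentions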

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
get_pos_tags("Ram killed Ravan")
get_noun_phrases(get_pos_tags("Satya bought 300 shares of ABC Inc. in 2016."))

      pos - part of speech

from nltk.chunk import tree2conlltags
>>> entities = nltk.chunk.ne_chunk(tagged)
ner_chunks = nltk.ne_chunk(pos_tags)
ner_tagged = tree2conlltags(ner_chunks)
get_ner_tags(get_pos_tags('Satya bought 300 shares of ABC Inc. in 2016.'))
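get_pos_tags and get_ner_tags are not nltk functions; a plausible sketch of such helpers, built from the pos_tag / ne_chunk / tree2conlltags calls above (needs the 'maxent_ne_chunker' and 'words' data):

import nltk
from nltk.chunk import tree2conlltags

def get_pos_tags(text):
    # hypothetical helper: tokenize a sentence and POS-tag it
    return nltk.pos_tag(nltk.word_tokenize(text))

def get_ner_tags(pos_tags):
    # hypothetical helper: run named-entity chunking and flatten the tree to IOB triples
    ner_chunks = nltk.ne_chunk(pos_tags)
    return tree2conlltags(ner_chunks)       # [(word, pos, iob-entity-tag), ...]

print(get_ner_tags(get_pos_tags('Satya bought 300 shares of ABC Inc. in 2016.')))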
>>> from nltk.corpus import treebank, stopwords
>>> t = treebank.parsed_sents('wsj_0001.mrg')[0]
>>> t.draw()
from nltk.tokenize import TreebankWordTokenizer
_word_tokenize = TreebankWordTokenizer().tokenize
chunkParser = nltk.RegexpParser(grammar)
chunked = chunkParser.parse(pos_tags)
phrases = get_np_chunks(doc)  # custom helper -- a sketch follows
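get_np_chunks is likewise a custom helper; one way it could be written around the RegexpParser lines above, assuming a simple noun-phrase grammar:

import nltk

def get_np_chunks(doc):
    # hypothetical helper: return the noun phrases found in a text
    grammar = "NP: {<DT>?<JJ>*<NN.*>+}"     # optional determiner, adjectives, then one or more nouns
    chunkParser = nltk.RegexpParser(grammar)
    phrases = []
    for sent in nltk.sent_tokenize(doc):
        pos_tags = nltk.pos_tag(nltk.word_tokenize(sent))
        chunked = chunkParser.parse(pos_tags)
        for subtree in chunked.subtrees(filter=lambda t: t.label() == "NP"):
            phrases.append(" ".join(word for word, tag in subtree.leaves()))
    return phrases

print(get_np_chunks("Satya bought 300 shares of ABC Inc. in 2016."))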

>>> from nltk.tokenize import PunktWordTokenizer  # removed from the public API in newer NLTK releases; use word_tokenize instead
>>> punkt_word_tokenizer = PunktWordTokenizer()
>>> punkt_word_tokenizer.tokenize("this's a test")
>>> from nltk.tokenize import PunktSentenceTokenizer
>>> from nltk.corpus import brown, twitter_samples, state_union
>>> brown.words()[0:10]
>>> brown.tagged_words()[0:10]

>>> import nltk.data
>>> tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
>>> tokenizer.tokenize(text)
stop_words = set(stopwords.words("english"))
twitter_samples.fileids()
twitter_samples.strings('tweets.20170430-223406.json')
tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
txt = state_union.raw("2006-GWBush.txt")
sent_tokenizer = PunktSentenceTokenizer(txt)  # trains the sentence tokenizer on the raw text
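Tying the corpus lines above together: load the 2006 State of the Union address, train a Punkt sentence tokenizer on it, and split it into sentences (requires the 'state_union' corpus):

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

txt = state_union.raw("2006-GWBush.txt")
sent_tokenizer = PunktSentenceTokenizer(txt)   # the training text is optional; it adapts the model
sentences = sent_tokenizer.tokenize(txt)
print(sentences[:2])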

from nltk.tag import pos_tag_sents
tweets_tagged = pos_tag_sents(tweets_tokens)
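POS-tagging every tokenized tweet in one call (assumes the 'twitter_samples' corpus and tagger data are installed):

from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
tweets_tagged = pos_tag_sents(tweets_tokens)   # a list of tagged token lists, one per tweet
print(tweets_tagged[0][:5])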

from nltk.stem import PorterStemmer
ps = PorterStemmer()
ps.stem(word)
english_stopwords = nltk.corpus.stopwords.words('english')
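Combining the Porter stemmer with the stopword list, on a made-up sentence:

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
english_stopwords = set(nltk.corpus.stopwords.words('english'))

words = word_tokenize("The runners were running quickly through the park")
stems = [ps.stem(w) for w in words if w.lower() not in english_stopwords]
print(stems)   # e.g. ['runner', 'run', 'quickli', 'park']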

from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")
snowball_stemmer.stem('running')  # stems one word at a time
# stemmer and the functions below are custom helpers, not nltk functions (a sketch follows)
stemmer(['runs', 'walking'])
stemmer(remove_stopwords(get_spell_corrections(tokenization(convert_lower(remove_nonalphanumeric(text))))))
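remove_nonalphanumeric, convert_lower, tokenization, remove_stopwords and stemmer are not nltk functions; a rough sketch of how such a cleaning pipeline could look (the spell-correction step is left out so the sketch only depends on nltk):

import re
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

snowball_stemmer = SnowballStemmer("english")
english_stopwords = set(nltk.corpus.stopwords.words('english'))

def remove_nonalphanumeric(text):
    return re.sub(r"[^A-Za-z0-9\s]", " ", text)

def convert_lower(text):
    return text.lower()

def tokenization(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    return [t for t in tokens if t not in english_stopwords]

def stemmer(tokens):
    return [snowball_stemmer.stem(t) for t in tokens]

text = "The Runners were running, quickly!"
print(stemmer(remove_stopwords(tokenization(convert_lower(remove_nonalphanumeric(text))))))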

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer.lemmatize('ran', pos='v')  # -> 'run'; pos is a WordNet tag ('n', 'v', 'a', 'r')
lemmatizer(['ran'], ['VB'])  # custom wrapper that maps Penn Treebank tags to WordNet tags (sketch below)
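lemmatize() expects WordNet POS codes ('n', 'v', 'a', 'r'), so a wrapper like lemmatizer() above would have to translate Penn Treebank tags first; a possible sketch (requires the 'wordnet' corpus):

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

def penn_to_wordnet(tag):
    # map a Penn Treebank tag to the WordNet POS constant lemmatize() expects
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

def lemmatizer(words, tags):
    return [wordnet_lemmatizer.lemmatize(w, penn_to_wordnet(t)) for w, t in zip(words, tags)]

print(lemmatizer(['ran'], ['VB']))   # -> ['run']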

from nltk.corpus import wordnet
wordnet.synsets("bank")
w1 = wordnet.synset('bank.n.01')
w2 = wordnet.synset('bank.n.06')
wordnet.synsets("bank")[5].definition()
print(w1.wup_similarity(w2))
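Expanding the WordNet snippet a little -- listing a few senses of "bank" and comparing two of them:

from nltk.corpus import wordnet

for syn in wordnet.synsets("bank")[:3]:
    print(syn.name(), "-", syn.definition())

w1 = wordnet.synset('bank.n.01')      # one sense of "bank"
w2 = wordnet.synset('bank.n.06')      # another sense
print(w1.wup_similarity(w2))          # Wu-Palmer similarity, between 0 and 1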

from autocorrect import spell
get_spell_corrections("right")  # custom wrapper around autocorrect's spell() (sketch below)

