NLTK

NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, and an active discussion forum.

Library documentation: http://www.nltk.org/

# needed to display the graphs
%matplotlib inline
# import the library and download sample texts
import nltk
nltk.download('gutenberg')
nltk.download('genesis')
nltk.download('inaugural')
nltk.download('nps_chat')
nltk.download('webtext')
nltk.download('treebank')
nltk.download('stopwords')
nltk.download('words')
nltk.download('wordnet')
nltk.download('brown')
nltk.download('names')
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
from nltk.book import *
# examine concordances (word + context)
text1.concordance("monstrous")
text1.similar("monstrous")
text2.common_contexts(["monstrous", "very"])
# see where in the text particular words occur (lexical dispersion plot)
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
# count of all tokens (including punctuation)
len(text3)
# number of distinct tokens
len(set(text3))
# the texts behave like lists of token strings and support slicing
text2[141525:]
# build a frequency distribution
fdist1 = FreqDist(text1) 
fdist1
fdist1.most_common(20)
fdist1['whale']
fdist1.plot(20, cumulative=True)
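
A FreqDist also exposes the words that occur only once in the text (hapax legomena), which complements the most_common view above.

# words that appear exactly once
fdist1.hapaxes()[:20]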
# apply a list comprehension to get words over 15 characters
V = set(text1)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)
fdist2 = FreqDist(text5)
sorted(w for w in set(text5) if len(w) > 7 and fdist2[w] > 7)
# word sequences that appear together unusually often
text4.collocations()
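
Under the hood, collocations() scores bigrams with an association measure; the same ranking can be done explicitly with the collocations module. The PMI measure and the frequency filter below are illustrative choices, not necessarily what collocations() uses by default.

# score bigrams explicitly with an association measure
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text4)
finder.apply_freq_filter(3)            # ignore bigrams seen fewer than 3 times
finder.nbest(bigram_measures.pmi, 10)  # 10 highest-scoring bigrams by pointwise mutual information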

Raw Text Processing

# download raw text from Project Gutenberg
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
len(raw)
raw[:75]
# tokenize the raw text
from nltk import word_tokenize
tokens = word_tokenize(raw)
len(tokens)
tokens[:10]
text = nltk.Text(tokens)
text[1024:1062]
text.collocations()
raw.find("PART I")
# HTML parsing using the Beautiful Soup library
from bs4 import BeautifulSoup
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode('utf8')
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
tokens[0:10]
# isolate just the article text (slice bounds found by inspecting this particular page)
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')
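
Tokenization so far has been at the word level; sent_tokenize, backed by the pre-trained Punkt model downloaded earlier, splits raw text into sentences. A quick sketch on one of the Gutenberg texts:

# split raw text into sentences
sample = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(sample)
sents[79:85]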

Regular Expressions

# regular expression library
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# match the end of a word
[w for w in wordlist if re.search('ed$', w)][0:10]
# wildcard matches any single character
[w for w in wordlist if re.search('^..j..t..$', w)][0:10]
# combination of caret (start of word) and sets
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))

# the plus symbol matches one or more repetitions of the preceding pattern
[w for w in chat_words if re.search('^m+i+n+e+$', w)]
wsj = sorted(set(nltk.corpus.treebank.words()))

# more advanced regex examples (raw strings avoid escape-sequence warnings)
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)][0:10]
[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]
[w for w in wsj if re.search('^[0-9]{4}$', w)][0:10]
[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)][0:10]
[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)][0:10]
[w for w in wsj if re.search('(ed|ing)$', w)][0:10]
# using "findall" to extract partial matches from words
fd = nltk.FreqDist(vs for word in wsj 
                      for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common(12)
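
findall can also split a word into a candidate stem and suffix in one pass, giving a toy regex-based stemmer; the suffix list below is illustrative and far from complete.

# extract a (stem, suffix) pair; the non-greedy .*? lets the suffix group claim the word ending
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
[stem(w) for w in word_tokenize('government processing of languages')]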

Normalizing Text

# NLTK has several word stemmers built in
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens][0:10]
[lancaster.stem(t) for t in tokens][0:10]
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens][0:10]
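
The lemmatizer treats every word as a noun unless told otherwise, which is why many verb forms come through unchanged; passing a part-of-speech hint changes the result.

# the default part of speech is noun; pass pos='v' for verb forms
wnl.lemmatize('women')             # lemmatized as a noun
wnl.lemmatize('running')           # treated as a noun, so left as-is
wnl.lemmatize('running', pos='v')  # treated as a verb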
# also has a tokenizer that takes a regular expression as a parameter
text = 'That U.S.A. poster-print costs $12.40...'
# non-capturing groups (?:...) keep regexp_tokenize returning whole matches rather than group tuples
pattern = r'''(?x)          # set flag to allow verbose regexps
     (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A.
   | \w+(?:-\w+)*           # words with optional internal hyphens
   | \$?\d+(?:\.\d+)?%?     # currency and percentages, e.g. $12.40, 82%
   | \.\.\.                 # ellipsis
   | [][.,;"'?():_`-]       # these are separate tokens; includes ], [
'''
nltk.regexp_tokenize(text, pattern)
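
For comparison, a bare whitespace pattern keeps punctuation attached to the words, while NLTK's default word_tokenize applies its own (Treebank-style) conventions to the same string.

# a whitespace-only pattern leaves punctuation attached to the tokens
nltk.regexp_tokenize(text, r'\S+')
# the default tokenizer makes different choices than our custom pattern
word_tokenize(text)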

Tagging

# Use a built-in tokenizer and tagger
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)
# Find words that appear in similar contexts (distributional similarity over the Brown corpus)
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')
# Tagged words are represented as (word, tag) tuples
nltk.corpus.brown.tagged_words()[0:10]
nltk.corpus.brown.tagged_words(tagset='universal')[0:10]
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()
# Part-of-speech tag counts for words following "often" in the Brown learned category
brown_lrnd_tagged = brown.tagged_words(categories='learned', tagset='universal')
tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()
# Load some raw sentences to tag
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
# Default tagger (assigns the same tag to every token); first find the most frequent tag overall
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)
# Evaluate the performance against a tagged corpus
default_tagger.evaluate(brown_tagged_sents)
# Training a unigram tagger
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])
# Now evaluate it (on its own training data, so the score is optimistic)
unigram_tagger.evaluate(brown_tagged_sents)
# Combining taggers
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(brown_tagged_sents, backoff=t0)
t2 = nltk.BigramTagger(brown_tagged_sents, backoff=t1)
t2.evaluate(brown_tagged_sents)
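
The scores above are computed on the same sentences the taggers were trained on, so they overstate real accuracy; a minimal sketch of a held-out evaluation (the 90/10 split follows the NLTK book's convention):

# hold out 10% of the corpus for evaluation
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)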

Classifying Text

# Define a feature extractor
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Shrek')
# Prepare a list of examples
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
    [(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)
# Build feature sets and split the data into training and test sets
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features('Neo'))
classifier.classify(gender_features('Trinity'))
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
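
When the corpus is large, materializing every feature set in a list can use a lot of memory; nltk.classify.apply_features returns a lazy, list-like view instead (same split sizes as above).

# compute feature sets lazily instead of building them all up front
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])
classifier = nltk.NaiveBayesClassifier.train(train_set)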
# Document classification
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
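
To classify a new review, run its word list through the same feature extractor; the sentence below is made up purely for illustration.

# classify a hypothetical new review (lowercased to match the feature vocabulary)
new_review = [w.lower() for w in word_tokenize("A thoughtful, beautifully acted film that rewards patience.")]
classifier.classify(document_features(new_review))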