NLP

From Training Material
Jump to navigation Jump to search
### 1

# Build a trigram table: map each bigram (a, b) to the list of words
# that follow it in the sample text.
words = ('The Tragedie of Macbeth by William Shakespeare . '
         'The Tragedie is great .').split(' ')
trigrams = nltk.trigrams(words)
mapper = {}
for a, b, c in trigrams:
    # setdefault replaces the explicit "if key not in mapper" membership test
    mapper.setdefault((a, b), []).append(c)
mapper

### 2

from random import choice

def generate(word1, word2, N, mapping=None):
    """Generate text by a random walk over a trigram table.

    Starting from the seed bigram (word1, word2), repeatedly pick a
    random continuation from mapping[(word1, word2)] and slide the
    bigram window forward.

    Parameters:
        word1, word2: the seed bigram; both are included in the result.
        N: number of additional words to generate.
        mapping: dict mapping (word, word) bigrams to lists of possible
            next words; defaults to the module-level `mapper`.

    Returns a list of N + 2 words.
    Raises KeyError if a reached bigram has no entry in the table.
    """
    if mapping is None:
        mapping = mapper  # fall back to the table built above
    sent = [word1, word2]
    # range() works on both Python 2 and 3; xrange is Python 2 only
    for _ in range(N):
        next_word = choice(mapping[(word1, word2)])
        sent.append(next_word)
        word1, word2 = word2, next_word
    return sent
    
# Seed the generator with the text's opening bigram and grow it by
# five more words; the result is displayed interactively.
word1 = 'The'
word2 = 'Tragedie'
N = 5
generate(word1, word2, N)

### 3

# Same trigram table as section 1, but built from a full Gutenberg
# text instead of the toy sentence:
# words = ('The Tragedie of Macbeth by William Shakespeare . '
#          'The Tragedie is great .').split(' ')
words = nltk.corpus.gutenberg.words('austen-emma.txt')
trigrams = nltk.trigrams(words)
mapper = {}
for a, b, c in trigrams:
    # setdefault replaces the explicit "if key not in mapper" membership test
    mapper.setdefault((a, b), []).append(c)
mapper

from random import choice

def generate(word1, word2, N, mapping=None):
    """Generate text by a random walk over a trigram table.

    Starting from the seed bigram (word1, word2), repeatedly pick a
    random continuation from mapping[(word1, word2)] and slide the
    bigram window forward.

    Parameters:
        word1, word2: the seed bigram; both are included in the result.
        N: number of additional words to generate.
        mapping: dict mapping (word, word) bigrams to lists of possible
            next words; defaults to the module-level `mapper`.

    Returns a list of N + 2 words.
    Raises KeyError if a reached bigram has no entry in the table.
    """
    if mapping is None:
        mapping = mapper  # fall back to the table built above
    sent = [word1, word2]
    # range() works on both Python 2 and 3; xrange is Python 2 only
    for _ in range(N):
        next_word = choice(mapping[(word1, word2)])
        sent.append(next_word)
        word1, word2 = word2, next_word
    return sent
    
# choice() requires a sequence; wrapping in list() makes this work on
# Python 3, where dict.keys() returns a view.
word1, word2 = choice(list(mapper.keys()))
N = 50
tokens = generate(word1, word2, N)
# print(...) with a single string behaves identically on Python 2 and 3
print(' '.join(tokens))

### 4 

# Exercise:
# Write a function find_language that takes a word and returns a list of
# languages that this word may belong to. Use the udhr corpus. Narrow your
# search scope to files in Latin-1 encoding. Look up the words 'one' and 'ein'.

# UDHR file ids for the Latin-1 encoded translations only.
files = [fileid for fileid in nltk.corpus.udhr.fileids()
         if fileid.endswith('Latin1')]
files

def find_language(word):
    """Return the Latin-1 UDHR file ids whose word list contains `word`.

    Each file id doubles as a language name, so the result is the list
    of candidate languages the word may belong to.
    """
    matches = []
    for language in files:
        if word in nltk.corpus.udhr.words(language):
            matches.append(language)
    return matches

find_language('one')

### 5

# Frequency distribution over Emma: alphabetic tokens only, lowercased.
words = [w.lower()
         for w in nltk.corpus.gutenberg.words('austen-emma.txt')
         if w.isalpha()]
fd = nltk.FreqDist(words)

fd['the']        # absolute count of 'the'

fd.freq('the')   # relative frequency of 'the'

fd.most_common(10)  # ten most frequent (word, count) pairs

### 6

# Zipf’s Law states that the frequency of a word is inversely proportional to its rank. Does this law hold for English?

# Hint: use logarithmic x and y axes when plotting number of occurrences against rank.

# Zipf's law: a word's frequency is inversely proportional to its rank.
# On log-log axes, count vs. rank should be roughly a straight line.
fd = nltk.FreqDist(nltk.corpus.gutenberg.words('bible-kjv.txt'))
# most_common() is sorted by descending count, i.e. already by rank;
# keep only the counts (the list index is then the rank).
occurrences = [count for word, count in fd.most_common()]

from matplotlib import pylab
pylab.xscale('log')
pylab.yscale('log')
pylab.plot(occurrences)

### 7

# Conditional frequency distribution: word counts per Brown genre.
cfd = nltk.ConditionalFreqDist(
    (genre, token)
    for genre in nltk.corpus.brown.categories()
    for token in nltk.corpus.brown.words(categories=genre)
)

### 8 

# How frequently do letters appear in different languages?
# Use swadesh corpus.
# Letter frequencies per language, using the Swadesh word lists.
cfd = nltk.ConditionalFreqDist([
    (language, letter)
    for language in nltk.corpus.swadesh.fileids()
    for letter in ' '.join(nltk.corpus.swadesh.words(language)).lower()
    if letter.isalpha()  # keep characters only, not whitespace or punctuation
])
cfd.tabulate(samples='abcdefghijklmnopqrstuvwxyz')

### 9

# Which initial letters are more frequent for males versus females?
# Use names corpus.

# Initial-letter frequencies by gender; the names corpus has one file
# per gender ('male.txt', 'female.txt'), so the fileid is the condition.
cfd = nltk.ConditionalFreqDist([
    (gender, name[0].lower())
    for gender in nltk.corpus.names.fileids()
    for name in nltk.corpus.names.words(gender)
])
cfd.plot()

### 10

# What is the lexical diversity of different genres? Lexical diversity is the ratio of the number of words to the number of unique words. That is, it’s the average number of times each word appears in the text.
# Use brown corpus.

# 1. Write the function that computes lexical diversity:

def lexical_diversity(words):
    """Return the average number of occurrences per distinct word.

    Computed as len(words) / len(set(words)). Returns 0.0 for an empty
    sequence to avoid ZeroDivisionError.
    """
    if not words:
        return 0.0
    # float() keeps true division under Python 2 as well
    return len(words) / float(len(set(words)))

# 2. Test that lexical_diversity function works:
# >>> lexical_diversity(['a', 'b', 'b'])
# 1.5

# 3. Compute lexical diversity for each category from brown corpus. 
for category in nltk.corpus.brown.categories():
    # one diversity score per genre; print(...) with a single string
    # behaves identically on Python 2 and 3
    print('{0}: {1:.2f}'.format(
        category,
        lexical_diversity(nltk.corpus.brown.words(categories=category))))