NLP — Day 2: NLTK exercises (WordNet, regular expressions, stemming)
### 1
# Explore WordNet: synsets, lemmas, hyponyms.
from nltk.corpus import wordnet as wn
wn.synsets('word')
car = wn.synset('car.n.01')
car.lemma_names()
car.hyponyms()
wn.all_synsets('n')
sum(1 for n in wn.all_synsets('n'))
# What percentage of noun synsets have no hyponyms?
nouns = list(wn.all_synsets('n'))
# A synset with no hyponyms is a leaf of the hyponym hierarchy.
leaves = sum(1 for s in nouns if not s.hyponyms())
print('{:.1f}%'.format(100.0 * leaves / len(nouns)))
### 2
# What is the branching factor of noun hyponyms?
# That is, how many hyponyms on average has each noun synset?
hyponyms = [len(s.hyponyms()) for s in wn.all_synsets('n')]
# float() keeps the division true on Python 2 as well.
sum(hyponyms) / float(len(hyponyms))
### 3
# Navigate the hypernym/hyponym hierarchy and compare synsets.
right_whale = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
# Deepest shared ancestor(s) of the two synsets.
right_whale.lowest_common_hypernyms(orca)
baleen_whale = right_whale.hypernyms()[0]
baleen_whale.hyponyms()
# min_depth: shortest path length from this synset up to the root.
right_whale.min_depth()
fruit = wn.synset('fruit.n.01')
# A whale and a fruit only meet very near the root ('whole').
whole = right_whale.lowest_common_hypernyms(fruit)[0]
whole
whole.min_depth()
# path_similarity is in (0, 1]: small for distant synsets,
# larger for close ones, exactly 1 for a synset with itself.
right_whale.path_similarity(fruit)
right_whale.path_similarity(orca)
right_whale.path_similarity(right_whale)
### 4
# Print a pairwise path-similarity matrix (in percent) for every
# noun synset of the given words.
# NOTE: the loop body had lost its indentation in the original paste;
# restored here so the snippet is valid Python.
words = ['food', 'fruit', 'car']
synsets = [synset
           for word in words
           for synset in wn.synsets(word, 'n')]
for s in synsets:
    # path_similarity can return None for unconnected synsets;
    # treat that as 0 so the numeric formatting never fails.
    similarities = [(s.path_similarity(t) or 0) * 100 for t in synsets]
    row = ' '.join('{:3.0f}'.format(w) for w in similarities)
    print('{:20} {}'.format(s.name(), row))
### 5
# Download a Project Gutenberg text and inspect its collocations.
import nltk
# Python 3 location of urlopen (the original had the Python 2
# `from urllib import urlopen`, which fails on Python 3).
from urllib.request import urlopen
url = 'http://www.gutenberg.org/files/1112/1112.txt'
# urlopen returns bytes; decode before handing to the tokenizer.
raw = urlopen(url).read().decode('utf8')
words = nltk.word_tokenize(raw)
words[:20]
text = nltk.Text(words)
text.collocations(50)
### 6
# Basic str operations. Converted from Python 2 print statements to
# Python 3 print() calls, consistent with sections 4 and 12.
text = 'string string string'
print(text)
print(text[:10])           # first 10 characters
print(text[5:10])          # slice
print(text[5])             # single character
print(text[-5])            # negative index counts from the end
print(text.find('ing'))    # index of first occurrence -> 3
print(text.find('sales'))  # -1 when the substring is absent
print('sales' in text)     # membership test -> False
print(text.replace('str', 'text'))
tokens = text.split(' ')
print(tokens)
print('-'.join(tokens))
print('\n'.join(tokens))
print(text.endswith('ing'))    # True
print(text.startswith('ing'))  # False
### 7
# Suppose we have room in a crossword puzzle
# for an 8-letter word with j as its third letter
# and t as its sixth letter. Find matching words.
import re
import nltk
# The snippet needs a word list, which the page never defined;
# use the lowercase entries of the Words Corpus (the list this
# exercise conventionally uses — confirm against your setup).
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# 8 characters total: 2 + 'j' + 2 + 't' + 2, anchored at both ends.
[w for w in wordlist if re.search(r'^..j..t..$', w)][:10]
### 8
# The T9 system is used for entering text on mobile phones.
# Two or more words that are entered with the same sequence
# of keystrokes are known as textonyms.
# For example, both hole and golf are entered by pressing
# the sequence 4653. What other words could be produced
# with the same sequence?
# 4 = ghi
# 6 = mno
# 5 = jkl
# 3 = def
import re
import nltk
# Word list was never defined on this page; use the lowercase
# entries of the Words Corpus (verify it matches your setup).
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# One character class per key press, anchored -> 4-letter words only.
[w for w in wordlist
 if re.search(r'^[ghi][mno][jkl][def]$', w)][:10]
### 9
import re
import nltk
wsj = sorted(set(nltk.corpus.treebank.words()))
# 1. Find all words that contain at least one digit.
[w for w in wsj if re.search(r'[0-9]', w)]
# 2. Find four-digit numbers.
[w for w in wsj if re.search(r'^[0-9]{4}$', w)]
# 3. Find all integer numbers that have four or more digits.
[w for w in wsj if re.search(r'^[0-9]{4,}$', w)]
# 4. Find decimal numbers.
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)]
# 5. Find words ending with ed or ing.
[w for w in wsj if re.search(r'(ed|ing)$', w)]
# 6. Find words like black-and-white, father-in-law,
#    A-in-B, etc. (three alphabetic parts joined by hyphens).
[w for w in wsj if re.search(r'^[A-Za-z]+-[a-z]+-[A-Za-z]+$', w)]
### 10
# What are the most common sequences of two or more vowels in
# English language?
# `[aeiou]{2,}` matches each maximal run of two or more vowels;
# counting all runs over the treebank vocabulary reproduces the
# expected output below.
fd = nltk.FreqDist(vs
                   for word in wsj
                   for vs in re.findall(r'[aeiou]{2,}', word))
fd.most_common()
# Expected output:
# [(u'io', 549),
# (u'ea', 476),
# (u'ie', 331),
# (u'ou', 329),
# (u'ai', 261),
# (u'ia', 253),
# ...]
### 11
# Tabulate which vowel follows which in two-vowel sequences.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Each match `vs` is a two-character string such as 'io';
# ConditionalFreqDist unpacks every item into a (condition, sample)
# pair, so 'io' becomes ('i', 'o'): condition = first vowel,
# sample = second vowel.
fd = nltk.ConditionalFreqDist(
vs
for word in wsj
for vs in re.findall('[aeiou]{2}',
word.lower()))
# Rows = first vowel, columns = second vowel, cells = counts.
fd.tabulate()
### 12
# walking -> walk
# walked -> walk
# lying -> ly
# -ing, -ed, -ly, -ious, -ies, -ive, -es, -s, -ment
import re

def stem(word):
    """Strip one common English suffix from *word*, if present.

    Returns the word unchanged when no listed suffix matches.
    """
    # Non-greedy stem + optional suffix, anchored to the whole word:
    # the shortest stem that still lets a listed suffix (or nothing)
    # reach the end of the string wins. findall returns one
    # (stem, suffix) tuple for the anchored match.
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

stem('walking')  # walk
stem('go')       # go
stem('lying')    # ly
### 12
# Same idea step by step (loop bodies re-indented; the original
# paste had lost the indentation). Without a lower bound on the
# stem, the optional suffix can swallow short words entirely:
import re

for word in ['walking', 'walked', 'going', 'go']:
    print(re.findall('^(.*?)(ing|ed)?$', word))

# Requiring at least two stem characters keeps a word like 'ing'
# from being reduced to an empty stem:
for word in ['walking', 'walked', 'ing', 'going', 'go']:
    print(re.findall('^(.{2,}?)(ing|ed)?$', word))
### 13
# The Porter stemmer covers the same suffix stripping as the
# hand-written regexp above, with many more special cases.
porter = nltk.PorterStemmer()
for token in ('lying', 'walking', 'walk', 'walked'):
    print(porter.stem(token))
### 14
# Token-level regex search over Moby Dick.
tokens = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
moby = nltk.Text(tokens)
# In nltk's findall pattern language, angle brackets delimit whole
# tokens; only the parenthesised middle token is displayed, so this
# lists the words that occur between 'a' and 'man'.
moby.findall(r"<a> (<.*>) <man>")
### 15
# Search for patterns like "A and other Bs" in brown corpus.
import nltk
brown_text = nltk.Text(nltk.corpus.brown.words())
# <...> delimits a token; \w+ matches any word, and the trailing
# 's' restricts the last token to (mostly) plural forms.
brown_text.findall(r'<\w+> <and> <other> <\w+s>')
# Expected output:
# companies and other corporations;
# Union and other members;
# Wagner and other officials;
# grillwork and other things;