Pszlachta1 at 10:12, 3 June 2025

2025-06-03T10:12:57Z

← Older revision		Revision as of 10:12, 3 June 2025
Line 157:		Line 157:
	???		???

	</~~source~~>		</syntaxhighlight>

Pszlachta1 at 10:10, 3 June 2025

2025-06-03T10:10:38Z

← Older revision		Revision as of 10:10, 3 June 2025
Line 1:		Line 1:
	{{Cat\|NLP}}		{{Cat\|NLP}}
	<~~source~~ lang="python">		<syntaxhighlight lang="python">

	### 1		### 1

Bernard Szlachta at 14:20, 16 December 2016

2016-12-16T14:20:01Z

New page

{{Cat|NLP}}
<source lang="python">

### 1

# Build a trigram lookup table: every pair of consecutive words maps to the
# list of words that followed that pair in the text (repeats are kept).
words = ('The Tragedie of Macbeth by William Shakespeare . '
         'The Tragedie is great .').split(' ')
trigrams = nltk.trigrams(words)
mapper = {}
for first, second, third in trigrams:
    # setdefault creates the empty list the first time a pair is seen.
    mapper.setdefault((first, second), []).append(third)
mapper

### 2

from random import choice

def generate(word1, word2, N):
    """Generate a word sequence by walking the trigram table ``mapper``.

    Starting from the pair ``(word1, word2)``, repeatedly pick at random one
    of the words recorded in ``mapper`` for the current pair, append it, and
    slide the pair forward by one word.

    Args:
        word1: First word of the seed pair.
        word2: Second word of the seed pair.
        N: Maximum number of words to append after the seed pair.

    Returns:
        A list that starts with ``word1`` and ``word2`` followed by up to
        ``N`` generated words. The list is shorter when the walk reaches a
        pair that has no entry in ``mapper`` (for example the last two
        words of the source text).
    """
    sent = [word1, word2]
    # range() rather than xrange() so this runs on both Python 2 and 3.
    for _ in range(N):
        followers = mapper.get((word1, word2))
        if not followers:
            # Dead end: nothing ever followed this pair, so stop here
            # instead of failing with KeyError.
            break
        next_word = choice(followers)
        sent.append(next_word)
        word1, word2 = word2, next_word
    return sent

# Seed pair: the first two words of the sample text defined above.
word1, word2 = 'The', 'Tragedie'
# Number of words to generate after the seed pair.
N = 5
generate(word1, word2, N)

### 3

# Alternative small input (disabled):
# words = ('The Tragedie of Macbeth by William Shakespeare . '
# 'The Tragedie is great .').split(' ')

# Build the trigram lookup table from the Gutenberg file 'austen-emma.txt':
# every pair of consecutive words maps to the list of words that followed it.
words = nltk.corpus.gutenberg.words('austen-emma.txt')
trigrams = nltk.trigrams(words)
mapper = {}
for first, second, third in trigrams:
    # setdefault creates the empty list the first time a pair is seen.
    mapper.setdefault((first, second), []).append(third)
mapper

from random import choice

def generate(word1, word2, N):
    """Produce a random word sequence from the trigram table ``mapper``.

    The walk begins at the pair ``(word1, word2)``. At each step one of the
    words stored in ``mapper`` for the current pair is chosen at random and
    appended, and the pair advances by one word.

    Args:
        word1: First word of the starting pair.
        word2: Second word of the starting pair.
        N: Maximum number of words to append after the starting pair.

    Returns:
        A list beginning with ``word1`` and ``word2`` followed by at most
        ``N`` generated words. Generation ends early when the current pair
        has no entry in ``mapper``.
    """
    sent = [word1, word2]
    # range() rather than xrange() so this runs on both Python 2 and 3.
    for _ in range(N):
        followers = mapper.get((word1, word2))
        if not followers:
            # No recorded continuation for this pair: stop instead of
            # raising KeyError.
            break
        next_word = choice(followers)
        sent.append(next_word)
        word1, word2 = word2, next_word
    return sent

# Pick a random starting pair among the keys of the trigram table.
# list(mapper) yields an indexable list of keys on both Python 2 and
# Python 3; on Python 3 mapper.keys() is a view that choice() cannot index.
word1, word2 = choice(list(mapper))
# Number of words to generate after the starting pair.
N = 50
tokens = generate(word1, word2, N)
# Parenthesised form is valid both as a Python 2 print statement and as a
# Python 3 print() call.
print(' '.join(tokens))

### 4

# Exercise:
# Write a function find_language that takes a word and returns a list of languages that this word may be in. Use the udhr corpus. Narrow down your search scope to files in Latin1 encoding. Look up the words 'one' and 'ein'.

# Keep only the udhr corpus file ids whose name ends with 'Latin1'.
files = [filename
for filename in nltk.corpus.udhr.fileids()
if filename.endswith('Latin1')]
# Bare expression: only has a visible effect in an interactive session.
files

def find_language(word):
    """Return the entries of ``files`` whose udhr text contains ``word``.

    Each entry of the module-level ``files`` list is checked in order, and
    it is kept when ``word`` occurs among the words of that corpus file.
    """
    matches = []
    for fileid in files:
        if word in nltk.corpus.udhr.words(fileid):
            matches.append(fileid)
    return matches

find_language('one')

### 5

# Frequency distribution over the lower-cased tokens of 'austen-emma.txt';
# tokens containing any non-letter character are dropped by isalpha().
words = nltk.corpus.gutenberg.words('austen-emma.txt')
words = [w.lower() for w in words if w.isalpha()]
fd = nltk.FreqDist(words)

# Absolute count of 'the'.
fd['the']

# Relative frequency of 'the' within the distribution.
fd.freq('the')

# Ten most frequent words. Passing the limit to most_common() avoids
# building the fully sorted list only to slice off its first ten items.
fd.most_common(10)

### 6

# Zipf’s Law states that the frequency of a word is inversely proportional to its rank. Does this law hold for English?

# Hint: use logarithmic x and y axes when plotting number of occurrences against rank.

fd = nltk.FreqDist(nltk.corpus.gutenberg.words('bible-kjv.txt'))
occurences = [? for x in fd.most_common()]

from matplotlib import pylab
pylab.xscale('log')
pylab.yscale('log')
pylab.plot(occurences)

### 7

# Conditional frequency distribution built from (category, word) pairs:
# one pair for every word of every category in the brown corpus.
cfd = nltk.ConditionalFreqDist(
(category, word)
for category in nltk.corpus.brown.categories()
for word in nltk.corpus.brown.words(categories=category)
)

### 8

# How frequently do letters appear in different languages?
# Use swadesh corpus.
cfd = nltk.ConditionalFreqDist([
???
for language in ???
for letter in ???
if ??? # if letter is a character, not whitespace or punctuation
])
cfd.tabulate(samples='abcdefghijklmnopqrstuvwxyz')

### 9

# Which initial letters are more frequent for males versus females?
# Use names corpus.

cfd = nltk.ConditionalFreqDist([
???
for gender in ???
for name in ???
])
cfd.plot()

### 10

# What is the lexical diversity of different genres? Lexical diversity is the ratio of the number of words to the number of unique words. That is, it’s the average number of times each word appears in the text.
# Use brown corpus.

# 1. Write the function that computes lexical diversity:

def lexical_diversity(words):
return ???

# 2. Test that lexical_diversity function works:
>>> lexical_diversity(['a', 'b', 'b'])
1.5

# 3. Compute lexical diversity for each category from brown corpus.
for category in ???:
???

</source>

Nlp - Revision history

Pszlachta1 at 10:12, 3 June 2025

Pszlachta1 at 10:10, 3 June 2025

Bernard Szlachta at 14:20, 16 December 2016