>>> from nltk.corpus import brown
>>> cfd = nltk.ConditionalFreqDist((genre,word)
for genre in brown.categories()
for word in brown.words(categories=genre))
>>> cfd.items()
[('mystery', <FreqDist with 6982 samples and 57169 outcomes>), ('belles_lettres', <FreqDist with 18421 samples and 173096 outcomes>), ('humor', <FreqDist with 5017 samples and 21695 outcomes>), ('government', <FreqDist with 8181 samples and 70117 outcomes>),
('fiction', <FreqDist with 9302 samples and 68488 outcomes>), ('reviews', <FreqDist with 8626 samples and 40704 outcomes>), ('religion', <FreqDist with 6373 samples and 39399 outcomes>), ('romance', <FreqDist with 8452 samples and 70022 outcomes>), ('science_fiction',
<FreqDist with 3233 samples and 14470 outcomes>), ('adventure', <FreqDist with 8874 samples and 69342 outcomes>), ('editorial', <FreqDist with 9890 samples and 61604 outcomes>), ('hobbies', <FreqDist with 11935 samples and 82345 outcomes>), ('lore', <FreqDist
with 14503 samples and 110299 outcomes>), ('news', <FreqDist with 14394 samples and 100554 outcomes>), ('learned', <FreqDist with 16859 samples and 181888 outcomes>)]
from __future__ import division
def lexical_diversity(text):
    """Return how many times, on average, each distinct item occurs in *text*.

    The score is ``len(text) / len(set(text))``: the total number of items
    divided by the number of distinct items.

    Args:
        text: A sized collection of hashable items, e.g. a list of tokens.

    Returns:
        The ratio of total items to distinct items, or ``0.0`` when *text*
        is empty (the ratio would otherwise be 0 / 0).
    """
    # Guard on the length rather than truthiness so any object that
    # implements __len__ is handled the same way.
    if len(text) == 0:
        return 0.0
    # True division is intended here; the file enables it with
    # ``from __future__ import division``.
    return len(text) / len(set(text))
def lexical_diversity(my_text_data):
    """Return the ratio of total items to distinct items in *my_text_data*.

    Args:
        my_text_data: A sized collection of hashable items, e.g. a list of
            word tokens.

    Returns:
        ``word_count / vocab_size``, or ``0.0`` when *my_text_data* is empty
        (an empty input has no vocabulary, so the ratio is undefined).
    """
    word_count = len(my_text_data)
    if word_count == 0:
        # Avoid ZeroDivisionError: no items means no distinct items either.
        return 0.0
    vocab_size = len(set(my_text_data))
    # True division is intended; the file enables it with
    # ``from __future__ import division``.
    diversity_score = word_count / vocab_size
    return diversity_score
def plural(word):
    """Return a naive English plural of *word* based on its suffix.

    Rules, tried in order:

    1. empty string               -> returned unchanged
    2. vowel + ``y``              -> add ``s``               (boy -> boys)
    3. any other final ``y``      -> drop ``y``, add ``ies`` (fairy -> fairies)
    4. ``s``, ``x``, ``sh``, ``ch`` -> add ``es``            (fox -> foxes)
    5. ``an``                     -> replace with ``en``     (woman -> women)
    6. anything else              -> add ``s``               (cat -> cats)

    Args:
        word: The singular form, as a string.

    Returns:
        The pluralised string.
    """
    if not word:
        # Nothing to inflect; also avoids indexing into an empty string.
        return word

    if word.endswith('y'):
        # Only a consonant + "y" ending takes "-ies"; "boy" -> "boys",
        # not "boies".
        preceded_by_vowel = len(word) > 1 and word[-2] in 'aeiouAEIOU'
        if preceded_by_vowel:
            return word + 's'
        return word[:-1] + 'ies'

    if word.endswith(('s', 'x', 'sh', 'ch')):
        return word + 'es'

    if word.endswith('an'):
        # NOTE(review): this rule is broader than English allows (it maps
        # "fan" -> "fen"); it is kept as-is so existing callers see the same
        # results for "-an" words.
        return word[:-2] + 'en'

    return word + 's'