Friday, November 15, 2013

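A sketch in Python syntax (using NLTK) for computing the Kullback-Leibler divergence (KLD) between a document and categories of the Brown corpus: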

from math import log

# NOTE: this snippet expects `nltk`, `brown` (i.e. nltk.corpus.brown) and the
# input string `document` to be in scope already; it does not define them.

porter = nltk.PorterStemmer()
wnl = nltk.WordNetLemmatizer()


def normalize(token):
    """Return `token` stemmed and then lemmatized.

    The same normalization is applied to the document, the vocabulary and
    the corpus categories so that frequency lookups use matching keys.
    """
    return wnl.lemmatize(porter.stem(token))


tokens = nltk.word_tokenize(document)

# Unique tokens, stemmed and then lemmatized.
words = sorted(set(tokens))
words = [porter.stem(t) for t in words]
lemma = [wnl.lemmatize(t) for t in words]

# Different tokens can collapse to the same stem/lemma, so de-duplicate
# before truncating; otherwise a term would be counted twice in the sum.
# The slice starts at 0 (the original `[1:200]` dropped the first entry).
vocabulary = sorted(set(lemma))[:200]

# categories
news = brown.words(categories='news')
editorial = brown.words(categories='editorial')
reviews = brown.words(categories='reviews')
hobbies = brown.words(categories='hobbies')

# Frequency tables are keyed by normalized terms, like `vocabulary`.
fdist_news = nltk.FreqDist(normalize(w) for w in news)
fdist_editorial = nltk.FreqDist(normalize(w) for w in editorial)
fdist_reviews = nltk.FreqDist(normalize(w) for w in reviews)
fdist_hobbies = nltk.FreqDist(normalize(w) for w in hobbies)
fdist_document = nltk.FreqDist(normalize(t) for t in tokens)


def _relative_frequency(fdist, term):
    """Return count(term) / total count in `fdist`, or 0.0 if it is empty."""
    total = fdist.N()
    if not total:
        return 0.0
    # float() keeps this a true division under Python 2 as well.
    return fdist[term] / float(total)


def Ptk_d(term):
    """Return the relative frequency of `term` in the document."""
    return _relative_frequency(fdist_document, term)


def Ptk_document(term):
    """Return the relative frequency of `term` in the document (same as Ptk_d)."""
    return Ptk_d(term)


def Ptk_news(term):
    """Return the relative frequency of `term` in the 'news' category."""
    return _relative_frequency(fdist_news, term)


def Ptk_editorial(term):
    """Return the relative frequency of `term` in the 'editorial' category."""
    return _relative_frequency(fdist_editorial, term)


def Ptk_reviews(term):
    """Return the relative frequency of `term` in the 'reviews' category."""
    return _relative_frequency(fdist_reviews, term)


def Ptk_hobbies(term):
    """Return the relative frequency of `term` in the 'hobbies' category."""
    return _relative_frequency(fdist_hobbies, term)


def symmetric_kld(p, q, terms, floor=1e-10):
    """Return the sum over `terms` of (p(t) - q(t)) * log(p(t) / q(t)).

    This is the symmetric form of the divergence: each summand equals
    p*log(p/q) + q*log(q/p), so swapping `p` and `q` gives the same value.

    Args:
        p: callable mapping a term to its probability under the first model.
        q: callable mapping a term to its probability under the second model.
        terms: iterable of terms to sum over.
        floor: lower bound applied to every probability so that a term
            missing from one side does not cause log(0) or division by zero.
    """
    total = 0.0
    for term in terms:
        p_t = max(p(term), floor)
        q_t = max(q(term), floor)
        total += (p_t - q_t) * log(p_t / q_t)
    return total


KLD_editorial = symmetric_kld(Ptk_editorial, Ptk_d, vocabulary)


[in progress]

 

No comments:

Post a Comment