








  • TF(Term Frequency):词频,即一个词在单篇文档中出现的频率
  • IDF(Inverse Document Frequency):逆文档频率,用于突出关键词,即本文档区别于其它文档的部分
import numpy as np
from collections import Counter
import itertools
from visual import show_tfidf # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)

# Toy corpus: every string is one document.
docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

# Convert the words of every document to integer IDs.
# Commas are stripped and each document is split on single spaces.
docs_words = [d.replace(",", "").split(" ") for d in docs]
vocab = set(itertools.chain(*docs_words))
# Number the words in sorted order so the IDs are the same on every run;
# iterating the raw set follows string-hash order, which varies between runs.
v2i = {v: i for i, v in enumerate(sorted(vocab))}
i2v = {i: v for v, i in v2i.items()}

def safe_log(x):
    """Return the natural log of the non-zero entries of ``x``; zeros stay zero.

    The input is never modified: the result is a new float64 array. The
    previous in-place version overwrote the caller's data, so any later use
    of the same array (e.g. taking its mean) silently saw log values, and
    integer inputs had their logs truncated.

    Args:
        x: array-like of numbers.

    Returns:
        A float64 ndarray with the same shape as ``x``.
    """
    out = np.array(x, dtype=np.float64)  # np.array copies, so `x` is left untouched
    nonzero = out != 0
    out[nonzero] = np.log(out[nonzero])
    return out

# Term-frequency weighting schemes, selected by name in get_tf().
# Each callable receives the [n_vocab, n_doc] TF matrix and returns an array
# of the same shape.
tf_methods = {
    "log": lambda x: np.log(1 + x),
    "augmented": lambda x: 0.5 + 0.5 * x / np.max(x, axis=1, keepdims=True),
    # 1 where the word occurs, 0 otherwise. Same result as np.minimum(x, 1)
    # for raw counts, but also correct for fractional (normalised) TF values.
    "boolean": lambda x: np.where(x > 0, 1.0, 0.0),
    # safe_log gets a copy so the mean in the denominator is taken over the
    # original TF values even if safe_log works in place.
    "log_avg": lambda x: (1 + safe_log(x.copy())) / (1 + safe_log(np.mean(x, axis=1, keepdims=True))),
}
# Inverse-document-frequency weighting schemes, selected by name in get_idf().
# Each callable receives the [n_vocab, 1] document-frequency column, i.e. the
# number of documents that contain each word.
idf_methods = {
    "log": lambda x: 1 + np.log(len(docs) / (x + 1)),
    # Clamped at 0 so words found in (almost) every document never score negative.
    "prob": lambda x: np.maximum(0, np.log((len(docs) - x) / (x + 1))),
    "len_norm": lambda x: x / (np.sum(np.square(x)) + 1),
}

def get_tf(method="log"):
    """Build the weighted term-frequency matrix for the corpus.

    Term frequency measures how often a word appears inside one document.
    The raw count of each word is divided by the count of that document's
    most frequent word, then the weighting registered under ``method`` in
    ``tf_methods`` is applied.

    Args:
        method: key of ``tf_methods``.

    Returns:
        Array of shape [n_vocab, n_doc].

    Raises:
        ValueError: if ``method`` is not a key of ``tf_methods``.
    """
    weighted_tf = tf_methods.get(method)
    if weighted_tf is None:
        raise ValueError(
            "unknown tf method {!r}; expected one of {}".format(method, sorted(tf_methods)))

    _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)  # [n_vocab, n_doc]
    for i, d in enumerate(docs_words):
        counter = Counter(d)
        if not counter:
            continue  # document without words: its column stays all zero
        # The per-document maximum is the same for every word, so compute it once.
        max_count = max(counter.values())
        for v, count in counter.items():
            _tf[v2i[v], i] = count / max_count

    return weighted_tf(_tf)

def get_idf(method="log"):
    """Build the inverse-document-frequency column for the vocabulary.

    A word that appears in many documents gets a low IDF, meaning it is less
    useful for telling documents apart. The document frequency (number of
    documents containing the word) is counted for every word ID and passed
    to the weighting registered under ``method`` in ``idf_methods``.

    Args:
        method: key of ``idf_methods``.

    Returns:
        Array of shape [n_vocab, 1].

    Raises:
        ValueError: if ``method`` is not a key of ``idf_methods``.
    """
    idf_fn = idf_methods.get(method)
    if idf_fn is None:
        raise ValueError(
            "unknown idf method {!r}; expected one of {}".format(method, sorted(idf_methods)))

    # Build one set per document up front so each membership test is O(1)
    # instead of scanning the word list again for every vocabulary entry.
    doc_word_sets = [set(d) for d in docs_words]
    df = np.zeros((len(i2v), 1))
    for i in range(len(i2v)):
        word = i2v[i]
        df[i, 0] = sum(1 for words in doc_word_sets if word in words)

    return idf_fn(df)

def _unit_columns(m):
    """Scale every column of ``m`` to unit L2 norm; all-zero columns stay zero."""
    norms = np.sqrt(np.sum(np.square(m), axis=0, keepdims=True))
    # Dividing by 1 instead of 0 keeps zero columns at zero rather than NaN.
    return m / np.where(norms == 0, 1.0, norms)


def cosine_similarity(q, _tf_idf):
    """Cosine similarity between a query vector and every document column.

    Args:
        q: query column vector; docs_score passes shape [n_vocab, 1].
        _tf_idf: matrix with one document per column; docs_score passes
            shape [n_vocab, n_doc].

    Returns:
        1-D array with one similarity per document. A query or document
        whose vector is all zeros scores 0 instead of NaN.
    """
    unit_q = _unit_columns(q)
    unit_ds = _unit_columns(_tf_idf)
    similarity = unit_ds.T.dot(unit_q).ravel()
    return similarity

def docs_score(q, len_norm=False):
    """Score every document in the corpus against the query string ``q``.

    The query is tokenised like the corpus (commas removed, split on single
    spaces), turned into a TF-IDF vector using raw word counts and the
    module-level ``idf``, and compared with each column of the module-level
    ``tf_idf`` by cosine similarity.

    Words that are not in the vocabulary get extra all-zero rows appended to
    local copies of ``idf`` and ``tf_idf``, so they add nothing to any score.
    The row numbers for those words are kept in a local map; the global
    ``v2i``/``i2v`` are not modified. Registering them globally made a second
    query with the same unknown word index past the end of ``idf``.

    Args:
        q: query text.
        len_norm: if True, divide each score by the number of words in the
            corresponding document.

    Returns:
        1-D array with one score per document.
    """
    counter = Counter(q.replace(",", "").split(" "))

    # Give out-of-vocabulary words row numbers directly after the known rows.
    n_known = idf.shape[0]
    unknown_words = [w for w in counter if w not in v2i]
    unknown_rows = {w: n_known + k for k, w in enumerate(unknown_words)}

    if unknown_words:
        n_pad = len(unknown_words)
        _idf = np.concatenate((idf, np.zeros((n_pad, 1), dtype=np.float64)), axis=0)
        _tf_idf = np.concatenate(
            (tf_idf, np.zeros((n_pad, tf_idf.shape[1]), dtype=np.float64)), axis=0)
    else:
        # Without this branch the padded arrays above were always overwritten.
        _idf, _tf_idf = idf, tf_idf

    q_tf = np.zeros((len(_idf), 1), dtype=np.float64)  # [n_vocab, 1]
    for w, count in counter.items():
        row = v2i[w] if w in v2i else unknown_rows[w]
        q_tf[row, 0] = count

    q_vec = q_tf * _idf  # [n_vocab, 1]

    q_scores = cosine_similarity(q_vec, _tf_idf)
    if len_norm:
        len_docs = [len(d) for d in docs_words]
        q_scores = q_scores / np.array(len_docs)
    return q_scores

def get_keywords(n=2, n_docs=3):
    """Print the ``n`` highest-scoring TF-IDF words of the first documents.

    Args:
        n: number of keywords per document. Values <= 0 print an empty list
            (slicing with ``[-0:]`` used to return every word).
        n_docs: how many documents, starting at index 0, to report. Clamped
            to the number of columns in ``tf_idf``.

    Returns:
        None. One line is printed per document, with keywords ordered from
        lowest to highest score.
    """
    for c in range(min(n_docs, tf_idf.shape[1])):
        col = tf_idf[:, c]
        # argsort returns the indices that sort `col` in ascending order, so
        # the tail of that index array holds the highest-scoring words.
        order = np.argsort(col)
        idx = order[max(len(order) - n, 0):] if n > 0 else order[:0]
        print("doc{}, top{} keywords {}".format(c, n, [i2v[i] for i in idx]))

# Build the TF-IDF matrix with the default ("log") TF and IDF weightings.
# idf is a single column, so the product scales every document column of tf.
tf = get_tf() # [n_vocab, n_doc]
idf = get_idf() # [n_vocab, 1]
tf_idf = tf * idf # [n_vocab, n_doc]
# Report the shape and the first two rows (two vocabulary words) of each matrix.
print("tf shape(vecb in each docs): ", tf.shape)
print("\ntf samples:\n", tf[:2])
print("\nidf shape(vecb in all docs): ", idf.shape)
print("\nidf samples:\n", idf[:2])
print("\ntf_idf shape: ", tf_idf.shape)
print("\ntf_idf sample:\n", tf_idf[:2])

# test
# Score every document against a sample query and list the three best matches.
q = "I get a coffee cup"
scores = docs_score(q)
# argsort is ascending: take the last three indices and reverse them so the
# best-scoring document comes first.
d_ids = scores.argsort()[-3:][::-1]
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))

# Pass the matrix transposed to [n_doc, n_vocab] together with the word for
# each vocabulary index. show_tfidf lives in visual.py (not visible here);
# presumably it renders the matrix under the given name — TODO confirm.
show_tfidf(tf_idf.T, [i2v[i] for i in range(tf_idf.shape[0])], "tfidf_matrix")



from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from visual import show_tfidf # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)

# Toy corpus for the scikit-learn version: every string is one document.
docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

# Fit scikit-learn's TF-IDF model on the corpus; the result is a sparse
# matrix with one row per document.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = [str(n) for n in vectorizer.get_feature_names_out()]
else:
    feature_names = vectorizer.get_feature_names()
print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, feature_names)])
print("v2i: ", vectorizer.vocabulary_)

q = "I get a coffee cup"
qtf_idf = vectorizer.transform([q])  # TF-IDF vector of the query
res = cosine_similarity(tf_idf, qtf_idf)  # compare every document with the query
res = res.ravel().argsort()[-3:]  # indices of the three best scores, ascending
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in res[::-1]]))

i2v = {i: v for v, i in vectorizer.vocabulary_.items()}
# toarray() yields a plain ndarray; todense() returns the discouraged np.matrix type.
dense_tfidf = tf_idf.toarray()
show_tfidf(dense_tfidf, [i2v[i] for i in range(dense_tfidf.shape[1])], "tfidf_sklearn_matrix")



# [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf)
from tensorflow import keras
import tensorflow as tf
from utils import process_w2v_data # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)
from visual import show_w2v_word_embedding # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)

# Training corpus for the word2vec demo: every string is one "sentence" of
# space-separated single-character tokens.
corpus = [
    # numbers
    "5 2 4 8 6 2 3 6 4",
    "4 8 5 6 9 5 5 6",
    "1 1 5 2 3 3 8",
    "3 6 9 6 8 7 4 6 3",
    "8 9 9 6 1 4 3 4",
    "1 0 2 0 2 1 3 3 3 3 3",
    "9 3 3 0 1 4 7 8",
    "9 9 8 5 6 7 1 2 3 0 1 0",

    # alphabets, expecting that 9 is close to letters
    "a t g q e h 9 u f",
    "e q y u o i p s",
    "q o 9 p l k j o k k o p",
    "h g y i u t t a e q",
    "i k d q r e 9 e a d",
    "o p d g 9 s a f g a",
    "i u y g h k l a s w",
    "o l u y a o g f s",
    "o p i u y g d a s j d l",
    "u k i l o 9 l j s",
    "y g i s h k j l f r f",
    "i o h n 9 9 d 9 f a 9",
]

class CBOW(keras.Model):
    """Continuous bag-of-words word2vec model trained with an NCE loss.

    The embeddings of the context words are averaged and the result is used
    to predict the centre word.
    """

    def __init__(self, v_dim, emb_dim):
        """Create the embedding table, the NCE parameters and the optimizer.

        Args:
            v_dim: vocabulary size.
            emb_dim: size of each word embedding.
        """
        # keras.Model must be initialised before layers or weights are attached.
        super().__init__()
        self.v_dim = v_dim
        self.embeddings = keras.layers.Embedding(
            input_dim=v_dim, output_dim=emb_dim,  # [n_vocab, emb_dim]
            embeddings_initializer=keras.initializers.RandomNormal(0., 0.1),
        )

        # noise-contrastive estimation
        self.nce_w = self.add_weight(
            name="nce_w", shape=[v_dim, emb_dim],
            initializer=keras.initializers.TruncatedNormal(0., 0.1))  # [n_vocab, emb_dim]
        self.nce_b = self.add_weight(
            name="nce_b", shape=(v_dim,),
            initializer=keras.initializers.Constant(0.1))  # [n_vocab, ]

        self.opt = keras.optimizers.Adam(0.01)

    def call(self, x, training=None, mask=None):
        """Return the mean embedding of each row of context word IDs.

        ``training`` and ``mask`` are accepted but not used here.
        """
        # x.shape = [n, skip_window*2]
        o = self.embeddings(x)  # [n, skip_window*2, emb_dim]
        o = tf.reduce_mean(o, axis=1)  # [n, emb_dim]
        return o

    # negative sampling: take one positive label and num_sampled negative labels to compute the loss
    # in order to reduce the computation of full softmax
    def loss(self, x, y, training=None):
        """Return the mean NCE loss of predicting labels ``y`` from contexts ``x``."""
        embedded = self.call(x, training)
        return tf.reduce_mean(
            tf.nn.nce_loss(
                weights=self.nce_w, biases=self.nce_b, labels=tf.expand_dims(y, axis=1),
                inputs=embedded, num_sampled=5, num_classes=self.v_dim))

    def step(self, x, y):
        """Run one optimisation step on a batch and return the loss via ``.numpy()``."""
        with tf.GradientTape() as tape:
            loss = self.loss(x, y, True)
            grads = tape.gradient(loss, self.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.trainable_variables))
        return loss.numpy()

def train(model, data, steps=2500, batch_size=8, log_every=200):
    """Train ``model`` on random batches drawn from ``data``.

    Args:
        model: object with a ``step(bx, by)`` method that performs one update
            and returns the loss.
        data: object with a ``sample(n)`` method that returns a ``(bx, by)``
            batch.
        steps: number of training steps (default 2500).
        batch_size: value passed to ``data.sample`` on every step (default 8).
        log_every: print the loss whenever the step index is a multiple of
            this value (default 200); 0 or None disables printing.

    Returns:
        None.
    """
    for t in range(steps):
        bx, by = data.sample(batch_size)
        loss = model.step(bx, by)
        if log_every and t % log_every == 0:
            print("step: {} | loss: {}".format(t, loss))

if __name__ == "__main__":
    # skip_window sets the context range used to pick surrounding words; the
    # original note suggests values from 2 to 5.
    dataset = process_w2v_data(corpus, skip_window=2, method="cbow")
    embedding_dim = 2
    cbow = CBOW(dataset.num_word, embedding_dim)
    train(cbow, dataset)

    # plotting
    show_w2v_word_embedding(cbow, dataset, "./visual/results/cbow.png")
Author: Michelle19l
Link: https://gitee.com/michelle19l/michelle19l/2021/01/30/python/nlp/莫烦python/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless otherwise stated.
