from collections import defaultdict

# Nine short documents used as the example corpus.
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Stop words: common words that are dropped before counting.
stoplist = set('for a of the and to in'.split(' '))

# Pre-process each document: lowercase it, split on whitespace, and drop
# every stop word.
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in text_corpus
]

# Count how often each token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only the tokens that occur more than once.
processed_corpus = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
print(processed_corpus)
文档3: "System and human system engineering testing of EPS"
语料库与向量空间
从字符串到向量
1 2 3 4 5 6 7 8 9 10 11
from collections import defaultdict
from pprint import pprint  # pretty-printer

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Remove common words and tokenize.
stoplist = set('for a of the and to in'.split())
# A list (not a set) is required here: the inner token lists are unhashable,
# and the document order must be preserved.
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# Remove words that appear only once in the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
pprint(texts)
from collections import defaultdict

from gensim import corpora

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Remove common words and tokenize.
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# Remove words that appear only once.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

# Build the token dictionary from the filtered texts, then convert every
# document to its bag-of-words form with doc2bow.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Print each entry of corpus_lsi next to the document text at the same
# position (corpus_lsi and documents are defined outside this snippet).
for doc, as_text in zip(corpus_lsi, documents):
    print(doc, as_text)
1 2 3 4 5 6 7 8 9
[(0, 0.06600783396090243), (1, -0.5200703306361859)] Human machine interface for lab abc computer applications [(0, 0.19667592859142288), (1, -0.7609563167700061)] A survey of user opinion of computer system response time [(0, 0.08992639972446298), (1, -0.724186062675251)] The EPS user interface management system [(0, 0.07585847652178054), (1, -0.6320551586003424)] System and human system engineering testing of EPS [(0, 0.1015029918497996), (1, -0.5737308483002966)] Relation of user perceived response time to error measurement [(0, 0.7032108939378314), (1, 0.16115180214025712)] The generation of random binary unordered trees [(0, 0.8774787673119835), (1, 0.16758906864659295)] The intersection graph of paths in trees [(0, 0.9098624686818582), (1, 0.14086553628718873)] Graph minors IV Widths of trees and well quasi ordering [(0, 0.6165825350569278), (1, -0.05392907566389511)] Graph minors A survey
模型持久化
通过save()和load()实现
1 2 3 4 5 6 7 8 9
import os
import tempfile

# delete=False keeps the file on disk after the `with` block closes it, so it
# can be reopened by name below; it is removed explicitly afterwards.
with tempfile.NamedTemporaryFile(prefix='model-', suffix='.lsi', delete=False) as tmp:
    lsi_model.save(tmp.name)

try:
    loaded_lsi_model = models.LsiModel.load(tmp.name)
finally:
    # Remove the temporary file even when loading fails.
    os.unlink(tmp.name)
可用转换
Gensim实现了几种流行的向量空间模型算法:
Term Frequency * Inverse Document Frequency,Tf-Idf
1
# Build a TF-IDF model from the corpus; normalize=True is passed through to
# TfidfModel unchanged.
model = models.TfidfModel(
    corpus,
    normalize=True,
)
潜在语义索引(Latent Semantic Indexing,LSI,有时也称 LSA)
1
# Build an LSI model over tfidf_corpus, using `dictionary` as the id-to-word
# mapping and requesting 300 topics.
model = models.LsiModel(
    tfidf_corpus,
    id2word=dictionary,
    num_topics=300,
)
from collections import defaultdict

from gensim import corpora

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Remove common words and tokenize.
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# Remove words that appear only once.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

# Build the token dictionary from the filtered texts, then convert every
# document to its bag-of-words form with doc2bow.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Rank the documents by similarity score, highest first.  After sorting, each
# entry is a (document_id, score) pair, so the title has to be looked up by
# document_id; indexing `documents` by the entry's rank would attach every
# score to the wrong title.
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for doc_id, score in sims:
    print((doc_id, score), documents[doc_id])

# Output for the example query (scores as in the original listing, each one
# now shown next to the document it belongs to):
# (2, 0.9984453) The EPS user interface management system
# (0, 0.998093) Human machine interface for lab abc computer applications
# (3, 0.9865886) System and human system engineering testing of EPS
# (1, 0.93748635) A survey of user opinion of computer system response time
# (4, 0.90755945) Relation of user perceived response time to error measurement
# (8, 0.050041765) Graph minors A survey
# (7, -0.09879464) Graph minors IV Widths of trees and well quasi ordering
# (6, -0.10639259) The intersection graph of paths in trees
# (5, -0.12416792) The generation of random binary unordered trees