
Commit 04a3c1a

piskvorky authored and tmylk committed
code style fixes to the tfidf module (#1313)
1 parent 93ec566 commit 04a3c1a

File tree

1 file changed: +16 -11 lines changed

gensim/models/tfidfmodel.py

Lines changed: 16 additions & 11 deletions
@@ -28,8 +28,7 @@ def precompute_idfs(wglobal, dfs, total_docs):
     """Precompute the inverse document frequency mapping for all terms."""
     # not strictly necessary and could be computed on the fly in TfidfModel.__getitem__.
     # this method is here just to speed things up a little.
-    return dict((termid, wglobal(df, total_docs))
-                for termid, df in iteritems(dfs))
+    return dict((termid, wglobal(df, total_docs)) for termid, df in iteritems(dfs))


 class TfidfModel(interfaces.TransformationABC):
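
For context: this helper just precomputes, per term id, the global weight that wglobal assigns given the term's document frequency. A minimal standalone sketch of the idea, assuming the default df2idf-style weighting (a base-2 logarithm of total_docs over document frequency); the signature and defaults here are illustrative, not gensim's verbatim code, and dfs.items() is used instead of six's iteritems to keep the sketch dependency-free:

    from math import log

    def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
        # inverse document frequency: log_base-logarithm of (total docs / doc freq)
        return add + log(1.0 * totaldocs / docfreq) / log(log_base)

    def precompute_idfs(wglobal, dfs, total_docs):
        # map each term id to its precomputed global (IDF) weight
        return dict((termid, wglobal(df, total_docs)) for termid, df in dfs.items())

    dfs = {0: 10, 1: 2, 2: 50}  # term id -> number of documents containing the term
    print(precompute_idfs(df2idf, dfs, total_docs=100))
    # {0: 3.32..., 1: 5.64..., 2: 1.0}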
@@ -49,8 +48,9 @@ class TfidfModel(interfaces.TransformationABC):
 
     Model persistency is achieved via its load/save methods.
     """
-    def __init__(self, corpus=None, id2word=None, dictionary=None,
-                 wlocal=utils.identity, wglobal=df2idf, normalize=True):
+    def __init__(
+            self, corpus=None, id2word=None, dictionary=None,
+            wlocal=utils.identity, wglobal=df2idf, normalize=True):
         """
         Compute tf-idf by multiplying a local component (term frequency) with a
         global component (inverse document frequency), and normalizing
@@ -87,11 +87,13 @@ def __init__(self, corpus=None, id2word=None, dictionary=None,
         # statistics we need to construct the IDF mapping. we can skip the
         # step that goes through the corpus (= an optimization).
         if corpus is not None:
-            logger.warning("constructor received both corpus and explicit "
-                           "inverse document frequencies; ignoring the corpus")
+            logger.warning(
+                "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus")
             self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
             self.dfs = dictionary.dfs.copy()
             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
+            if id2word is None:
+                self.id2word = dictionary
         elif corpus is not None:
             self.initialize(corpus)
         else:
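
The added id2word fallback means that a model constructed directly from a gensim Dictionary also inherits its id-to-word mapping. A minimal usage sketch (the toy texts are made up for illustration):

    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel

    texts = [
        ["human", "interface", "computer"],
        ["graph", "trees", "computer"],
    ]
    dictionary = Dictionary(texts)  # collects document frequencies as a side effect

    # built from the dictionary's statistics alone (no corpus pass needed);
    # with the fallback above, id2word now defaults to the dictionary itself
    model = TfidfModel(dictionary=dictionary)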
@@ -114,7 +116,7 @@ def initialize(self, corpus):
         numnnz, docno = 0, -1
         for docno, bow in enumerate(corpus):
             if docno % 10000 == 0:
-                logger.info("PROGRESS: processing document #%i" % docno)
+                logger.info("PROGRESS: processing document #%i", docno)
             numnnz += len(bow)
             for termid, _ in bow:
                 dfs[termid] = dfs.get(termid, 0) + 1
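
This logging change swaps eager %-interpolation for the logging module's lazy argument passing: the message is only formatted if the record is actually emitted, and log handlers see a stable format string. A small self-contained illustration:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    docno = 12345
    # eager: the string is built even when INFO is disabled
    logger.info("PROGRESS: processing document #%i" % docno)
    # lazy: logging interpolates the argument only when the record is emitted
    logger.info("PROGRESS: processing document #%i", docno)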
@@ -126,8 +128,9 @@ def initialize(self, corpus):
 
         # and finally compute the idf weights
         n_features = max(dfs) if dfs else 0
-        logger.info("calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" %
-                    (self.num_docs, n_features, self.num_nnz))
+        logger.info(
+            "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)",
+            self.num_docs, n_features, self.num_nnz)
         self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)


@@ -142,8 +145,10 @@ def __getitem__(self, bow, eps=1e-12):
 
         # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
         # as strict application of the IDF formula would dictate)
-        vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
-                  for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]
+        vector = [
+            (termid, self.wlocal(tf) * self.idfs.get(termid))
+            for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0
+        ]

         # and finally, normalize the vector either to unit length, or use a
         # user-defined normalization function
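
__getitem__ is what runs when the model is applied with bracket syntax. Continuing the sketch from the constructor hunk above (exact weights depend on wlocal, wglobal and normalize, so take the commented output as approximate):

    bow = dictionary.doc2bow(["computer", "graph", "graph"])
    print(model[bow])
    # -> roughly [(graph_id, 1.0)]: "computer" appears in every training document,
    #    so its IDF is 0.0 and it is dropped by the filter shown above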
