@@ -28,8 +28,7 @@ def precompute_idfs(wglobal, dfs, total_docs):
2828 """Precompute the inverse document frequency mapping for all terms."""
2929 # not strictly necessary and could be computed on the fly in TfidfModel__getitem__.
3030 # this method is here just to speed things up a little.
31- return dict ((termid , wglobal (df , total_docs ))
32- for termid , df in iteritems (dfs ))
31+ return dict ((termid , wglobal (df , total_docs )) for termid , df in iteritems (dfs ))
3332
3433
3534class TfidfModel (interfaces .TransformationABC ):
@@ -49,8 +48,9 @@ class TfidfModel(interfaces.TransformationABC):
4948
5049 Model persistency is achieved via its load/save methods.
5150 """
52- def __init__ (self , corpus = None , id2word = None , dictionary = None ,
53- wlocal = utils .identity , wglobal = df2idf , normalize = True ):
51+ def __init__ (
52+ self , corpus = None , id2word = None , dictionary = None ,
53+ wlocal = utils .identity , wglobal = df2idf , normalize = True ):
5454 """
5555 Compute tf-idf by multiplying a local component (term frequency) with a
5656 global component (inverse document frequency), and normalizing
@@ -87,11 +87,13 @@ def __init__(self, corpus=None, id2word=None, dictionary=None,
8787 # statistics we need to construct the IDF mapping. we can skip the
8888 # step that goes through the corpus (= an optimization).
8989 if corpus is not None :
90- logger .warning ("constructor received both corpus and explicit "
91- " inverse document frequencies; ignoring the corpus" )
90+ logger .warning (
91+ "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus" )
9292 self .num_docs , self .num_nnz = dictionary .num_docs , dictionary .num_nnz
9393 self .dfs = dictionary .dfs .copy ()
9494 self .idfs = precompute_idfs (self .wglobal , self .dfs , self .num_docs )
95+ if id2word is None :
96+ self .id2word = dictionary
9597 elif corpus is not None :
9698 self .initialize (corpus )
9799 else :
@@ -114,7 +116,7 @@ def initialize(self, corpus):
114116 numnnz , docno = 0 , - 1
115117 for docno , bow in enumerate (corpus ):
116118 if docno % 10000 == 0 :
117- logger .info ("PROGRESS: processing document #%i" % docno )
119+ logger .info ("PROGRESS: processing document #%i" , docno )
118120 numnnz += len (bow )
119121 for termid , _ in bow :
120122 dfs [termid ] = dfs .get (termid , 0 ) + 1
@@ -126,8 +128,9 @@ def initialize(self, corpus):
126128
127129 # and finally compute the idf weights
128130 n_features = max (dfs ) if dfs else 0
129- logger .info ("calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" %
130- (self .num_docs , n_features , self .num_nnz ))
131+ logger .info (
132+ "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" ,
133+ self .num_docs , n_features , self .num_nnz )
131134 self .idfs = precompute_idfs (self .wglobal , self .dfs , self .num_docs )
132135
133136
@@ -142,8 +145,10 @@ def __getitem__(self, bow, eps=1e-12):
142145
143146 # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
144147 # as strict application of the IDF formula would dictate)
145- vector = [(termid , self .wlocal (tf ) * self .idfs .get (termid ))
146- for termid , tf in bow if self .idfs .get (termid , 0.0 ) != 0.0 ]
148+ vector = [
149+ (termid , self .wlocal (tf ) * self .idfs .get (termid ))
150+ for termid , tf in bow if self .idfs .get (termid , 0.0 ) != 0.0
151+ ]
147152
148153 # and finally, normalize the vector either to unit length, or use a
149154 # user-defined normalization function
0 commit comments