Skip to content

Commit 531e98d

Browse files
committed
update: faster histograms
1 parent 7cf77c0 commit 531e98d

File tree

5 files changed

+258
-97
lines changed

5 files changed

+258
-97
lines changed

include/kmtricks/cmd.hpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,14 +178,14 @@ struct main_count
178178
spdlog::debug("[push] - CountTask - S={}, P={}", opt->id, i);
179179
pool.add_task(std::make_shared<CountTask<MAX_K, DMAX_C, SuperKStorageReader>>(
180180
path, config, superk_storage, pinfo, i, KmDir::get().m_fof.get_i(opt->id),
181-
config._kmerSize, opt->c_ab_min, opt->lz4, hist, opt->clear));
181+
config._kmerSize, opt->c_ab_min, opt->lz4, hist->clone(), opt->clear));
182182
}
183183
else if (opt->format == "kff")
184184
{
185185
spdlog::debug("[push] - KffCountTask - S={}, P={}", opt->id, i);
186186
pool.add_task(std::make_shared<KffCountTask<MAX_K, DMAX_C, SuperKStorageReader>>(
187187
path, config, superk_storage, pinfo, i, KmDir::get().m_fof.get_i(opt->id),
188-
config._kmerSize, opt->c_ab_min, hist, opt->clear));
188+
config._kmerSize, opt->c_ab_min, hist->clone(), opt->clear));
189189
}
190190
}
191191
else if (opt->format == "hash" || opt->format == "vector")
@@ -199,21 +199,24 @@ struct main_count
199199
pool.add_task(std::make_shared<HashCountTask<MAX_K, DMAX_C, SuperKStorageReader>>(
200200
path, config, superk_storage, pinfo, i, KmDir::get().m_fof.get_i(opt->id),
201201
hw.get_window_size_bits(), config._kmerSize, opt->c_ab_min, opt->lz4,
202-
hist, opt->clear));
202+
hist->clone(), opt->clear));
203203
}
204204
else
205205
{
206206
spdlog::debug("[push] - HashVecCountTask - S={}, P={}", opt->id, i);
207207
pool.add_task(std::make_shared<HashVecCountTask<MAX_K, DMAX_C, SuperKStorageReader>>(
208208
path, config, superk_storage, pinfo, i, KmDir::get().m_fof.get_i(opt->id),
209-
hw.get_window_size_bits(), config._kmerSize, opt->c_ab_min, opt->lz4, hist, opt->clear));
209+
hw.get_window_size_bits(), config._kmerSize, opt->c_ab_min, opt->lz4, hist->clone(), opt->clear));
210210
}
211211
}
212212
}
213213
pool.join_all();
214214

215215
if (opt->hist)
216+
{
217+
hist->merge_clones();
216218
HistWriter(KmDir::get().get_hist_path(opt->id), *hist, false);
219+
}
217220
}
218221
};
219222

include/kmtricks/histogram.hpp

Lines changed: 152 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -29,74 +29,181 @@
2929

3030
namespace km {
3131

32+
enum class KHistType { UNIQUE, TOTAL };
33+
3234
class KHist
3335
{
36+
template<size_t buf_size>
37+
friend class HistWriter;
38+
39+
template<size_t buf_size>
40+
friend class HistReader;
41+
3442
public:
35-
KHist() {}
43+
KHist() = default;
3644
KHist(int idx, size_t ksize, size_t lower, size_t upper)
37-
: idx(idx), ksize(ksize), lower(lower), upper(upper)
45+
: m_idx(idx), m_ksize(ksize), m_lower(lower), m_upper(upper)
3846
{
39-
hist_u.resize(upper-lower+1, 0);
40-
hist_n.resize(upper-lower+1, 0);
47+
m_hist_u.resize(m_upper - m_lower + 1, 0);
48+
m_hist_n.resize(m_upper - m_lower + 1, 0);
4149
}
4250

4351
void inc(uint64_t count)
4452
{
45-
std::unique_lock<std::mutex> lock(m_mutex);
46-
uniq++;
47-
total += count;
48-
if (count < lower)
53+
m_uniq++;
54+
m_total += count;
55+
if (count < m_lower)
4956
{
50-
oob_lu++;
51-
oob_ln+=count;
57+
m_oob_lu++;
58+
m_oob_ln += count;
5259
}
53-
else if (count > upper)
60+
else if (count > m_upper)
5461
{
55-
oob_uu++;
56-
oob_un+=count;
62+
m_oob_uu++;
63+
m_oob_un += count;
5764
}
5865
else
5966
{
60-
hist_u[count-lower]++;
61-
hist_n[count-lower]+=count;
67+
m_hist_u[count - m_lower]++;
68+
m_hist_n[count - m_lower] += count;
6269
}
6370
}
6471

65-
void print_histu()
72+
void set_type(KHistType type)
6673
{
67-
_print(hist_u);
74+
m_type = type;
6875
}
6976

70-
void print_histn()
77+
std::shared_ptr<KHist> clone()
7178
{
72-
_print(hist_n);
79+
m_clone = true;
80+
std::shared_ptr<KHist> hist = std::make_shared<KHist>(m_idx, m_ksize, m_lower, m_upper);
81+
m_clones.push_back(hist);
82+
return hist;
7383
}
7484

75-
private:
76-
void _print(const std::vector<uint64_t>& v)
85+
uint64_t unique() const { return m_uniq; }
86+
uint64_t total() const { return m_total; }
87+
uint64_t lower() const { return m_lower; }
88+
uint64_t upper() const { return m_upper; }
89+
90+
uint64_t oob_lower_unique() const { return m_oob_lu; }
91+
uint64_t oob_upper_unique() const { return m_oob_uu; }
92+
uint64_t oob_lower_total() const { return m_oob_ln; }
93+
uint64_t oob_upper_total() const { return m_oob_un; }
94+
95+
uint32_t kmer_size() const { return m_ksize; }
96+
uint32_t idx() const { return m_idx; }
97+
98+
int64_t get_count(size_t c, KHistType type) const
99+
{
100+
if ((c < m_lower) || (c > m_upper))
101+
return -1;
102+
if (type == KHistType::UNIQUE)
103+
return m_hist_u[c];
104+
return m_hist_n[c];
105+
}
106+
107+
const std::vector<uint64_t>& get_vec(KHistType type = KHistType::UNIQUE) const
108+
{
109+
if (type == KHistType::UNIQUE)
110+
return m_hist_u;
111+
return m_hist_n;
112+
}
113+
114+
void merge_clones()
115+
{
116+
if (m_clone && !m_merged)
117+
{
118+
for (auto& h : m_clones)
119+
{
120+
m_uniq += h->m_uniq;
121+
m_total += h->m_total;
122+
m_oob_lu += h->m_oob_lu;
123+
m_oob_uu += h->m_oob_uu;
124+
m_oob_ln += h->m_oob_ln;
125+
m_oob_un += h->m_oob_un;
126+
for (size_t i=0; i<h->m_hist_u.size(); i++)
127+
{
128+
m_hist_u[i] += h->m_hist_u[i];
129+
m_hist_n[i] += h->m_hist_n[i];
130+
}
131+
}
132+
m_merged = true;
133+
clear_clones();
134+
}
135+
}
136+
137+
void clear_clones()
138+
{
139+
m_clones.clear();
140+
m_clone = false;
141+
m_merged = false;
142+
}
143+
144+
std::string as_string(KHistType type = KHistType::UNIQUE, const std::string sep = "\n") const
77145
{
78-
uint64_t current = lower;
79-
for_each(v.begin(), v.end(), [&current](uint64_t c){
80-
std::cerr << std::to_string(current) << " " << std::to_string(c) << "\n";
81-
current++;
146+
std::stringstream ss;
147+
uint64_t count = 0;
148+
auto vec = m_type == KHistType::UNIQUE ? m_hist_u : m_hist_n;
149+
std::for_each(vec.begin(), vec.end(), [&count, &ss, &sep](uint64_t c){
150+
ss << std::to_string(count) << " " << std::to_string(c) << sep;
151+
count++;
82152
});
83-
std::cerr << std::flush;
153+
return ss.str();
84154
}
85155

86-
public:
87-
int32_t idx {0};
88-
uint32_t ksize {0};
89-
uint64_t lower {0};
90-
uint64_t upper {0};
91-
uint64_t uniq {0};
92-
uint64_t total {0};
93-
uint64_t oob_lu {0};
94-
uint64_t oob_uu {0};
95-
uint64_t oob_ln {0};
96-
uint64_t oob_un {0};
97-
std::vector<uint64_t> hist_u;
98-
std::vector<uint64_t> hist_n;
99-
std::mutex m_mutex;
156+
void print(std::ostream& output_stream, KHistType type = KHistType::UNIQUE, const std::string sep = "\n") const
157+
{
158+
output_stream << as_string(type, sep);
159+
}
160+
161+
auto begin()
162+
{
163+
if (m_type == KHistType::UNIQUE)
164+
return m_hist_u.begin();
165+
return m_hist_n.begin();
166+
}
167+
168+
auto end()
169+
{
170+
if (m_type == KHistType::UNIQUE)
171+
return m_hist_u.end();
172+
return m_hist_n.end();
173+
}
174+
175+
auto cbegin() const
176+
{
177+
if (m_type == KHistType::UNIQUE)
178+
return m_hist_u.cbegin();
179+
return m_hist_n.cbegin();
180+
}
181+
182+
auto cend()
183+
{
184+
if (m_type == KHistType::UNIQUE)
185+
return m_hist_u.cend();
186+
return m_hist_n.cend();
187+
}
188+
189+
private:
190+
int32_t m_idx {0};
191+
uint32_t m_ksize {0};
192+
uint64_t m_lower {0};
193+
uint64_t m_upper {0};
194+
uint64_t m_uniq {0};
195+
uint64_t m_total {0};
196+
uint64_t m_oob_lu {0};
197+
uint64_t m_oob_uu {0};
198+
uint64_t m_oob_ln {0};
199+
uint64_t m_oob_un {0};
200+
201+
std::vector<uint64_t> m_hist_u;
202+
std::vector<uint64_t> m_hist_n;
203+
std::vector<std::shared_ptr<KHist>> m_clones;
204+
KHistType m_type {KHistType::UNIQUE};
205+
bool m_clone {false};
206+
bool m_merged {false};
100207
};
101208

102209
using hist_t = std::shared_ptr<KHist>;
@@ -109,15 +216,16 @@ inline std::vector<uint32_t> compute_merge_thresholds(std::vector<hist_t>& histo
109216
for (size_t h=0; h<histograms.size(); h++)
110217
{
111218
uint32_t sum = 0;
112-
uint32_t n = histograms[h]->uniq * p;
113-
for (size_t i=0; i<histograms[h]->hist_u.size(); i++)
219+
uint32_t n = histograms[h]->unique() * p;
220+
auto v = histograms[h]->get_vec(KHistType::UNIQUE);
221+
for (size_t i=0; i<v.size(); i++)
114222
{
115223
if (sum > n)
116224
{
117225
thresholds.push_back(i);
118226
break;
119227
}
120-
sum += histograms[h]->hist_u[i];
228+
sum += v[i];
121229
}
122230
}
123231
std::ofstream out(path, std::ios::out); check_fstream_good(path, out);
@@ -128,4 +236,4 @@ inline std::vector<uint32_t> compute_merge_thresholds(std::vector<hist_t>& histo
128236
return thresholds;
129237
}
130238

131-
};
239+
};

include/kmtricks/io/hist_file.hpp

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -90,25 +90,25 @@ class HistWriter : public IFile<HistFileHeader, std::ostream, buf_size>
9090
: IFile<HistFileHeader, std::ostream, buf_size>(path, std::ios::out | std::ios::binary)
9191
{
9292
this->m_header.compressed = lz4;
93-
this->m_header.kmer_size = hist.ksize;
94-
this->m_header.id = hist.idx;
95-
this->m_header.lower = hist.lower;
96-
this->m_header.upper = hist.upper;
97-
this->m_header.uniq = hist.uniq;
98-
this->m_header.total = hist.total;
99-
this->m_header.oob_ln = hist.oob_ln;
100-
this->m_header.oob_lu = hist.oob_lu;
101-
this->m_header.oob_un = hist.oob_un;
102-
this->m_header.oob_uu = hist.oob_uu;
93+
this->m_header.kmer_size = hist.m_ksize;
94+
this->m_header.id = hist.m_idx;
95+
this->m_header.lower = hist.m_lower;
96+
this->m_header.upper = hist.m_upper;
97+
this->m_header.uniq = hist.m_uniq;
98+
this->m_header.total = hist.m_total;
99+
this->m_header.oob_ln = hist.m_oob_ln;
100+
this->m_header.oob_lu = hist.m_oob_lu;
101+
this->m_header.oob_un = hist.m_oob_un;
102+
this->m_header.oob_uu = hist.m_oob_uu;
103103

104104
this->m_header.serialize(this->m_first_layer.get());
105105

106106
this->template set_second_layer<ocstream>(this->m_header.compressed);
107107

108-
this->m_second_layer->write(reinterpret_cast<char*>(hist.hist_u.data()),
109-
hist.hist_u.size() * sizeof(uint64_t));
110-
this->m_second_layer->write(reinterpret_cast<char*>(hist.hist_n.data()),
111-
hist.hist_n.size() * sizeof(uint64_t));
108+
this->m_second_layer->write(reinterpret_cast<char*>(hist.m_hist_u.data()),
109+
hist.m_hist_u.size() * sizeof(uint64_t));
110+
this->m_second_layer->write(reinterpret_cast<char*>(hist.m_hist_n.data()),
111+
hist.m_hist_n.size() * sizeof(uint64_t));
112112
}
113113
};
114114

@@ -130,43 +130,46 @@ class HistReader : public IFile<HistFileHeader, std::istream, buf_size>
130130
{
131131
hist_t histo = std::make_shared<KHist>(this->m_header.id, this->m_header.kmer_size,
132132
this->m_header.lower, this->m_header.upper);
133-
histo->oob_lu = this->m_header.oob_lu;
134-
histo->oob_uu = this->m_header.oob_uu;
135-
histo->oob_ln = this->m_header.oob_ln;
136-
histo->oob_un = this->m_header.oob_un;
133+
histo->m_oob_lu = this->m_header.oob_lu;
134+
histo->m_oob_uu = this->m_header.oob_uu;
135+
histo->m_oob_ln = this->m_header.oob_ln;
136+
histo->m_oob_un = this->m_header.oob_un;
137137
this->m_second_layer->read(
138-
reinterpret_cast<char*>(histo->hist_u.data()), histo->hist_u.size()*sizeof(uint64_t));
138+
reinterpret_cast<char*>(histo->m_hist_u.data()),
139+
histo->m_hist_u.size()*sizeof(uint64_t));
139140
this->m_second_layer->read(
140-
reinterpret_cast<char*>(histo->hist_n.data()), histo->hist_n.size()*sizeof(uint64_t));
141+
reinterpret_cast<char*>(histo->m_hist_n.data()),
142+
histo->m_hist_n.size()*sizeof(uint64_t));
141143
return histo;
142144
}
143145

144146
void write_as_text(std::ostream& stream, bool n)
145147
{
146148
hist_t histo = get();
147-
uint64_t current = histo->lower;
148-
stream << "@LOWER=" << histo->lower << "\n";
149-
stream << "@UPPER=" << histo->upper << "\n";
149+
uint64_t current = histo->lower();
150+
stream << "@LOWER=" << histo->lower() << "\n";
151+
stream << "@UPPER=" << histo->upper() << "\n";
150152

151153
if (n)
152154
{
153-
stream << "@OOB_L=" << histo->oob_ln << "\n";
154-
stream << "@OOB_U=" << histo->oob_un << "\n";
155-
for_each(histo->hist_n.begin(), histo->hist_n.end(), [&current, &stream](uint64_t c) {
155+
histo->set_type(KHistType::TOTAL);
156+
stream << "@OOB_L=" << histo->oob_lower_total() << "\n";
157+
stream << "@OOB_U=" << histo->oob_upper_total() << "\n";
158+
for_each(histo->begin(), histo->end(), [&current, &stream](uint64_t c) {
156159
stream << std::to_string(current) << " " << std::to_string(c) << "\n";
157160
current++;
158161
});
159162
}
160163
else
161164
{
162-
stream << "@OOB_L=" << histo->oob_lu << "\n";
163-
stream << "@OOB_U=" << histo->oob_uu << "\n";
164-
for_each(histo->hist_u.begin(), histo->hist_u.end(), [&current, &stream](uint64_t c) {
165+
stream << "@OOB_L=" << histo->oob_lower_unique() << "\n";
166+
stream << "@OOB_U=" << histo->oob_upper_unique() << "\n";
167+
for_each(histo->begin(), histo->end(), [&current, &stream](uint64_t c) {
165168
stream << std::to_string(current) << " " << std::to_string(c) << "\n";
166169
current++;
167170
});
168171
}
169172
}
170173
};
171174

172-
};
175+
};

0 commit comments

Comments
 (0)