9 changes: 7 additions & 2 deletions CHANGELOG.md
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
...

## [v0.1.5]
### Added
- set a default value different from 0.0 for intervals not covered by the BigWig files

## [v0.1.4]
### Fixed
- Updated README.md with better install instructions. Making release so
@@ -43,8 +47,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- release to pypi

[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...HEAD
[v0.1.3]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...v0.1.4
[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.5...HEAD
[v0.1.5]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...v0.1.5
[v0.1.4]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.3...v0.1.4
[v0.1.3]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.2...v0.1.3
[v0.1.2]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.1...v0.1.2
[v0.1.1]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.0...v0.1.1
3 changes: 2 additions & 1 deletion bigwig_loader/batch_processor.py
@@ -104,6 +104,7 @@ def get_batch(
end: Union[Sequence[int], npt.NDArray[np.int64]],
window_size: int = 1,
scaling_factors_cupy: Optional[cp.ndarray] = None,
default_value: float = 0.0,
out: Optional[cp.ndarray] = None,
) -> cp.ndarray:
(
@@ -137,10 +138,10 @@ def get_batch(
query_starts=abs_start,
query_ends=abs_end,
window_size=window_size,
default_value=default_value,
out=out,
)
batch = cp.transpose(out, (1, 0, 2))
if scaling_factors_cupy is not None:
batch *= scaling_factors_cupy

return batch
5 changes: 4 additions & 1 deletion bigwig_loader/collection.py
@@ -140,6 +140,7 @@ def get_batch(
start: Union[Sequence[int], npt.NDArray[np.int64]],
end: Union[Sequence[int], npt.NDArray[np.int64]],
window_size: int = 1,
default_value: float = 0.0,
out: Optional[cp.ndarray] = None,
) -> cp.ndarray:
return self.batch_processor.get_batch(
@@ -148,6 +149,7 @@ def get_batch(
end=end,
window_size=window_size,
scaling_factors_cupy=self.scaling_factors_cupy,
default_value=default_value,
out=out,
)

@@ -171,7 +173,8 @@ def make_positions_global(

"""
offsets = np.array(
[self.chromosome_offset_dict[chrom] for chrom in chromosomes]
[self.chromosome_offset_dict[chrom] for chrom in chromosomes],
dtype=np.int64,
)
return positions + offsets

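For context, a minimal sketch of how the new `default_value` argument can be used at the collection level. The call mirrors the test added below in `tests/test_collection.py`; the collection construction path is a placeholder and not taken from this diff.

```python
import cupy as cp

from bigwig_loader.collection import BigWigCollection

# Placeholder path; point this at a directory containing .bw / .bigWig files.
collection = BigWigCollection("/path/to/bigwig/files")

# Positions not covered by any interval now get default_value instead of 0.0,
# which makes "no data" distinguishable from an explicit 0.0 value.
batch = collection.get_batch(
    ["chr1", "chr20", "chr4"],
    [0, 99998999, 99998999],
    [1000, 99999999, 99999999],
    default_value=cp.nan,
)

print(batch.shape)              # (batch_size, n_tracks, sequence_length)
print(cp.isnan(batch).any())    # True if any queried position lacked data
```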
17 changes: 12 additions & 5 deletions bigwig_loader/dataset.py
@@ -3,6 +3,7 @@
from typing import Any
from typing import Callable
from typing import Iterator
from typing import Literal
from typing import Optional
from typing import Sequence
from typing import Union
@@ -38,16 +39,16 @@ class BigWigDataset:
reference_genome_path: path to fasta file containing the reference genome.
sequence_length: number of base pairs in input sequence
center_bin_to_predict: if given, only do prediction on a central window. Should be
smaller than or equal to sequence_length. If not given will be the same as
sequence_length.
smaller than or equal to sequence_length. If None, the whole sequence length
will be used. Default: None
window_size: used to down sample the resolution of the target from sequence_length
moving_average_window_size: window size for moving average on the target. Can
help to smooth out the target. Default: 1, which means no smoothing. If
used in combination with window_size, the target is first downsampled and
then smoothed.
batch_size: batch size
super_batch_size: batch size that is used in the background to load data from
bigwig files. Should be larget than or equal to batch_size. If None, it will
bigwig files. Should be larger than or equal to batch_size. If None, it will
be equal to batch_size.
batches_per_epoch: because the length of an epoch is slightly arbitrary here,
the number of batches can be set by hand. If not the number of batches per
@@ -61,6 +62,9 @@ class BigWigDataset:
If None, no scaling is done. Keys can be (partial) file paths. See
bigwig_loader.path.match_key_to_path for more information about how
dict keys are mapped to paths.
default_value: value to use for intervals that are not present in the
bigwig file. Defaults to 0.0. Can be set to cp.nan to differentiate
between missing values and values explicitly listed as 0.0.
first_n_files: Only use the first n files (handy for debugging on fewer tasks)
position_sampler_buffer_size: number of intervals picked up front by the position sampler.
When all intervals are used, new intervals are picked.
@@ -73,7 +77,7 @@ class BigWigDataset:
n_threads: number of python threads / cuda streams to use for loading the data to
GPU. More threads means that more IO can take place while the GPU is busy doing
calculations (decompressing or neural network training for example). More threads
also means a higher GPU memory usage.
also means a higher GPU memory usage. Default: 4
return_batch_objects: if True, the batches will be returned as instances of
bigwig_loader.batch.Batch
"""
@@ -92,11 +96,12 @@ def __init__(
batches_per_epoch: Optional[int] = None,
maximum_unknown_bases_fraction: float = 0.1,
sequence_encoder: Optional[
Union[Callable[[Sequence[str]], Any], str]
Union[Callable[[Sequence[str]], Any], Literal["onehot"]]
] = "onehot",
file_extensions: Sequence[str] = (".bigWig", ".bw"),
crawl: bool = True,
scale: Optional[dict[Union[str | Path], Any]] = None,
default_value: float = 0.0,
first_n_files: Optional[int] = None,
position_sampler_buffer_size: int = 100000,
repeat_same_positions: bool = False,
@@ -139,6 +144,7 @@ def __init__(
self._first_n_files = first_n_files
self._file_extensions = file_extensions
self._crawl = crawl
self._default_value = default_value
self._scale = scale
self._position_sampler_buffer_size = position_sampler_buffer_size
self._repeat_same_positions = repeat_same_positions
@@ -181,6 +187,7 @@ def _create_dataloader(self) -> StreamedDataloader:
queue_size=self._n_threads + 1,
slice_size=self.batch_size,
window_size=self.window_size,
default_value=self._default_value,
)

def __iter__(
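To make the docstring's point about `default_value=cp.nan` concrete: with a NaN default, positions that are simply not covered by any interval can be told apart from positions whose value really is 0.0, and can be masked out downstream. A small self-contained CuPy sketch of that idea (independent of the loader; the values are made up):

```python
import cupy as cp

# A track of length 8; assume positions 5..7 are not covered by any interval.
covered = cp.asarray([0.0, 0.0, 1.5, 2.0, 0.0], dtype=cp.float32)

# Old behaviour: gaps are filled with 0.0 and look like real zero signal.
with_zero_default = cp.concatenate([covered, cp.zeros(3, dtype=cp.float32)])

# New option: gaps are filled with NaN and can be masked explicitly.
with_nan_default = cp.concatenate([covered, cp.full(3, cp.nan, dtype=cp.float32)])

mask = ~cp.isnan(with_nan_default)        # True where data actually exists
mean_of_covered = with_nan_default[mask].mean()
print(with_zero_default)
print(with_nan_default, mean_of_covered)
```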
13 changes: 8 additions & 5 deletions bigwig_loader/intervals_to_values.py
@@ -8,8 +8,6 @@

CUDA_KERNEL_DIR = Path(__file__).parent.parent / "cuda_kernels"

_zero = cp.asarray(0.0, dtype=cp.float32).item()


def get_cuda_kernel() -> str:
with open(CUDA_KERNEL_DIR / "intervals_to_values.cu") as f:
@@ -31,6 +29,7 @@ def intervals_to_values(
found_ends: cp.ndarray | None = None,
sizes: cp.ndarray | None = None,
window_size: int = 1,
default_value: float = 0.0,
out: cp.ndarray | None = None,
) -> cp.ndarray:
"""
@@ -58,6 +57,8 @@ def intervals_to_values(
sizes: number of elements in track_starts/track_ends/track_values for each track.
Only needed when found_starts and found_ends are not given.
window_size: size in basepairs to average over (default: 1)
default_value: value to use for regions where no data is specified (default: 0.0)
out: array of size n_tracks x batch_size x sequence_length to store the output.
Returns:
out: array of size n_tracks x batch_size x sequence_length

@@ -85,12 +86,15 @@ def intervals_to_values(
)

if out is None:
out = cp.zeros(
out = cp.full(
(found_starts.shape[0], len(query_starts), sequence_length // window_size),
default_value,
dtype=cp.float32,
)
else:
out *= _zero
logging.debug(f"Setting default value in output tensor to {default_value}")
out.fill(default_value)
logging.debug(out)

max_number_intervals = min(
sequence_length, (found_ends - found_starts).max().item()
@@ -136,7 +140,6 @@ def intervals_to_values(
out,
),
)

return out


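The core of the change above is that the output tensor is now initialised with `default_value` (`cp.full` for a fresh tensor, `out.fill` for a reused one) and the CUDA kernel only writes positions covered by an interval, so gaps keep the default. A rough pure-Python/CuPy sketch of that idea, assuming a single track and a single query and ignoring `window_size` averaging (this is not the actual kernel):

```python
import cupy as cp


def intervals_to_values_reference(
    track_starts: cp.ndarray,
    track_ends: cp.ndarray,
    track_values: cp.ndarray,
    query_start: int,
    query_end: int,
    default_value: float = 0.0,
) -> cp.ndarray:
    """Toy reference: fill with default_value, then overwrite covered positions."""
    out = cp.full(query_end - query_start, default_value, dtype=cp.float32)
    for start, end, value in zip(
        track_starts.tolist(), track_ends.tolist(), track_values.tolist()
    ):
        lo = max(start, query_start)
        hi = min(end, query_end)
        if lo < hi:
            # Only covered positions are written; gaps keep default_value.
            out[lo - query_start : hi - query_start] = value
    return out


starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32)
ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32)
values = cp.asarray([20.0, 30.0, 40.0, 50.0], dtype=cp.float32)
print(intervals_to_values_reference(starts, ends, values, 7, 18, default_value=cp.nan))
# expected: [20. 20. 20. 30. 30. 40. 40. nan nan nan nan]
```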
25 changes: 21 additions & 4 deletions bigwig_loader/pytorch.py
@@ -124,8 +124,8 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
reference_genome_path: path to fasta file containing the reference genome.
sequence_length: number of base pairs in input sequence
center_bin_to_predict: if given, only do prediction on a central window. Should be
smaller than or equal to sequence_length. If not given will be the same as
sequence_length.
smaller than or equal to sequence_length. If None, the whole sequence length
will be used. Default: None
window_size: used to down sample the resolution of the target from sequence_length
moving_average_window_size: window size for moving average on the target. Can
help to smooth out the target. Default: 1, which means no smoothing. If
@@ -141,8 +141,21 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
maximum_unknown_bases_fraction: maximum number of bases in an input sequence that
is unknown.
sequence_encoder: encoder to apply to the sequence. Default: bigwig_loader.util.onehot_sequences
position_samples_buffer_size: number of intervals picked up front by the position sampler.
file_extensions: load files with these extensions (default .bw and .bigWig)
crawl: whether to search in sub-directories for BigWig files
scale: Optional, dictionary with scaling factors for each BigWig file.
If None, no scaling is done. Keys can be (partial) file paths. See
bigwig_loader.path.match_key_to_path for more information about how
dict keys are mapped to paths.
default_value: value to use for intervals that are not present in the
bigwig file. Defaults to 0.0. Can be set to cp.nan to differentiate
between missing values and values explicitly listed as 0.0.
first_n_files: Only use the first n files (handy for debugging on fewer tasks)
position_sampler_buffer_size: number of intervals picked up front by the position sampler.
When all intervals are used, new intervals are picked.
repeat_same_positions: if True the positions sampler does not draw a new random collection
of positions when the buffer runs out, but repeats the same samples. Can be used to
check whether network can overfit.
sub_sample_tracks: int, if set a different random set of tracks is selected in each
superbatch from the total number of tracks. The indices corresponding to those tracks
are returned in the output.
@@ -160,7 +173,7 @@ def __init__(
collection: Union[str, Sequence[str], Path, Sequence[Path], BigWigCollection],
reference_genome_path: Path,
sequence_length: int = 1000,
center_bin_to_predict: Optional[int] = 200,
center_bin_to_predict: Optional[int] = None,
window_size: int = 1,
moving_average_window_size: int = 1,
batch_size: int = 256,
@@ -172,6 +185,8 @@ def __init__(
] = "onehot",
file_extensions: Sequence[str] = (".bigWig", ".bw"),
crawl: bool = True,
scale: Optional[dict[Union[str | Path], Any]] = None,
default_value: float = 0.0,
first_n_files: Optional[int] = None,
position_sampler_buffer_size: int = 100000,
repeat_same_positions: bool = False,
@@ -195,6 +210,8 @@ def __init__(
sequence_encoder=sequence_encoder,
file_extensions=file_extensions,
crawl=crawl,
scale=scale,
default_value=default_value,
first_n_files=first_n_files,
position_sampler_buffer_size=position_sampler_buffer_size,
repeat_same_positions=repeat_same_positions,
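A hedged end-to-end sketch of the PyTorch side: constructing the dataset with `default_value=cp.nan` and masking NaN targets out of the loss. The paths are placeholders, the dataset is assumed to yield `(sequence, target)` tuples (the default, without `return_batch_objects`), and the dummy "model" is only there to keep the example self-contained.

```python
from pathlib import Path

import cupy as cp
import torch

from bigwig_loader.pytorch import PytorchBigWigDataset

# Placeholder paths; point these at your BigWig directory and reference genome.
dataset = PytorchBigWigDataset(
    collection="/path/to/bigwig/files",
    reference_genome_path=Path("/path/to/reference.fasta"),
    sequence_length=1000,
    batch_size=16,
    default_value=cp.nan,   # uncovered regions show up as NaN in the target
)

for sequences, target in dataset:
    mask = ~torch.isnan(target)                  # keep only positions with data
    prediction = torch.zeros_like(target)        # stand-in for a model forward pass
    loss = torch.nn.functional.mse_loss(prediction[mask], target[mask])
    print(loss.item())
    break
```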
5 changes: 4 additions & 1 deletion bigwig_loader/streamed_dataset.py
@@ -108,6 +108,7 @@ def __init__(
queue_size: int = 10,
slice_size: int | None = None,
window_size: int = 1,
default_value: float = 0.0,
):
self.input_generator = input_generator
self.collection = collection
@@ -127,6 +128,7 @@ def __init__(
self.data_generator_thread: threading.Thread | None = None
self._entered = False
self._out = None
self._default_value = default_value

def __enter__(self) -> "StreamedDataloader":
self._entered = True
@@ -214,6 +216,7 @@ def _generate_batches(self) -> Generator[Batch, None, None]:
found_starts=found_starts[:, select],
found_ends=found_ends[:, select],
window_size=self.window_size,
default_value=self._default_value,
out=out,
)

@@ -248,7 +251,7 @@ def _get_out_tensor(
shape = (number_of_tracks, batch_size, sequence_length)

if self._out is None or self._out.shape != shape:
self._out = cp.zeros(shape, dtype=cp.float32)
self._out = cp.full(shape, self._default_value, dtype=cp.float32)
return self._out

def _determine_slice_size(self, n_samples: int) -> int:
23 changes: 23 additions & 0 deletions tests/test_collection.py
@@ -57,6 +57,29 @@ def test_get_batch(collection):
assert batch.shape == (256, n_files, 1000)


def test_get_batch_with_nans(collection):
"""
Test whether NaNs are returned when setting
default_value to cp.nan. Coordinates far beyond the
total chromosome length are used because the bigwigs
used for testing contain intervals with value 0.0
instead of simply omitting them.
"""
n_files = len(collection.bigwig_paths)

batch = collection.get_batch(
["chr1", "chr20", "chr4"],
[0, 99998999, 99998999],
[1000, 99999999, 99999999],
default_value=cp.nan,
)

print(batch)
assert batch.shape == (3, n_files, 1000)
assert cp.any(cp.isnan(batch))


def test_exclude_intervals(collection):
intervals = collection.intervals(
["chr3", "chr4", "chr5"], exclude_chromosomes=["chr4"]
31 changes: 31 additions & 0 deletions tests/test_intervals_to_values.py
@@ -230,3 +230,34 @@ def test_get_values_from_intervals_batch_multiple_tracks() -> None:
print(expected)
print(values)
assert (values == expected).all()


def test_default_nan() -> None:
"""Query end is exactly at end index before "gap"
Now instead of zeros, NaN values should be
used.
."""
track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32)
track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32)
track_values = cp.asarray([20.0, 30.0, 40.0, 50.0], dtype=cp.dtype("f4"))
query_starts = cp.asarray([7, 9], dtype=cp.int32)
query_ends = cp.asarray([18, 20], dtype=cp.int32)
reserved = cp.zeros([2, 11], dtype=cp.dtype("<f4"))
values = intervals_to_values(
track_starts,
track_ends,
track_values,
query_starts,
query_ends,
default_value=cp.nan,
out=reserved,
)
expected = cp.asarray(
[
[20.0, 20.0, 20.0, 30.0, 30.0, 40.0, 40.0, cp.nan, cp.nan, cp.nan, cp.nan],
[20.0, 30.0, 30.0, 40.0, 40.0, cp.nan, cp.nan, cp.nan, cp.nan, 50.0, 50.0],
]
)
print(expected)
print(values)
assert cp.allclose(values, expected, equal_nan=True)