diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9e75736..0a81bec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ...

+## [v0.1.5]
+### Added
+- Option to set a default value other than 0.0 for intervals not covered by the BigWig files
+
 ## [v0.1.4]
 ### Fixed
 - Updated README.md with better install instructions. Making release so
@@ -43,8 +47,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - release to pypi

-[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...HEAD
-[v0.1.3]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...v0.1.4
+[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.5...HEAD
+[v0.1.5]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...v0.1.5
+[v0.1.4]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.3...v0.1.4
 [v0.1.3]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.2...v0.1.3
 [v0.1.2]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.1...v0.1.2
 [v0.1.1]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.0...v0.1.1
diff --git a/bigwig_loader/batch_processor.py b/bigwig_loader/batch_processor.py
index a66715c..082916d 100644
--- a/bigwig_loader/batch_processor.py
+++ b/bigwig_loader/batch_processor.py
@@ -104,6 +104,7 @@ def get_batch(
         end: Union[Sequence[int], npt.NDArray[np.int64]],
         window_size: int = 1,
         scaling_factors_cupy: Optional[cp.ndarray] = None,
+        default_value: float = 0.0,
         out: Optional[cp.ndarray] = None,
     ) -> cp.ndarray:
         (
@@ -137,10 +138,10 @@ def get_batch(
             query_starts=abs_start,
             query_ends=abs_end,
             window_size=window_size,
+            default_value=default_value,
             out=out,
         )
         batch = cp.transpose(out, (1, 0, 2))
         if scaling_factors_cupy is not None:
             batch *= scaling_factors_cupy
-
         return batch
diff --git a/bigwig_loader/collection.py b/bigwig_loader/collection.py
index 24ab99f..6d6fcf0 100644
--- a/bigwig_loader/collection.py
+++ b/bigwig_loader/collection.py
@@ -140,6 +140,7 @@ def get_batch(
         start: Union[Sequence[int], npt.NDArray[np.int64]],
         end: Union[Sequence[int], npt.NDArray[np.int64]],
         window_size: int = 1,
+        default_value: float = 0.0,
         out: Optional[cp.ndarray] = None,
     ) -> cp.ndarray:
         return self.batch_processor.get_batch(
@@ -148,6 +149,7 @@ def get_batch(
             end=end,
             window_size=window_size,
             scaling_factors_cupy=self.scaling_factors_cupy,
+            default_value=default_value,
             out=out,
         )

@@ -171,7 +173,8 @@ def make_positions_global(
         """

         offsets = np.array(
-            [self.chromosome_offset_dict[chrom] for chrom in chromosomes]
+            [self.chromosome_offset_dict[chrom] for chrom in chromosomes],
+            dtype=np.int64,
         )
         return positions + offsets
diff --git a/bigwig_loader/dataset.py b/bigwig_loader/dataset.py
index 2c5a0f5..787f0bf 100644
--- a/bigwig_loader/dataset.py
+++ b/bigwig_loader/dataset.py
@@ -3,6 +3,7 @@
 from typing import Any
 from typing import Callable
 from typing import Iterator
+from typing import Literal
 from typing import Optional
 from typing import Sequence
 from typing import Union
@@ -38,8 +39,8 @@ class BigWigDataset:
         reference_genome_path: path to fasta file containing the reference genome.
         sequence_length: number of base pairs in input sequence
         center_bin_to_predict: if given, only do prediction on a central window. Should be
-            smaller than or equal to sequence_length. If not given will be the same as
-            sequence_length.
+            smaller than or equal to sequence_length. If None, the whole sequence length
+            will be used. Default: None
         window_size: used to down sample the resolution of the target from sequence_length
         moving_average_window_size: window size for moving average on the target. Can help
             too smooth out the target. Default: 1, which means no smoothing. If
@@ -47,7 +48,7 @@ class BigWigDataset:
             then smoothed.
         batch_size: batch size
         super_batch_size: batch size that is used in the background to load data from
-            bigwig files. Should be larget than or equal to batch_size. If None, it will
+            bigwig files. Should be larger than or equal to batch_size. If None, it will
             be equal to batch_size.
         batches_per_epoch: because the length of an epoch is slightly arbitrary here,
             the number of batches can be set by hand. If not the number of batches per
@@ -61,6 +62,9 @@ class BigWigDataset:
             If None, no scaling is done. Keys can be (partial) file paths. See
             bigwig_loader.path.match_key_to_path for more information about how
             dict keys are mapped to paths.
+        default_value: value to use for intervals that are not present in the
+            bigwig file. Defaults to 0.0. Can be set to cp.nan to differentiate
+            between missing values and values explicitly listed as 0.0.
         first_n_files: Only use the first n files (handy for debugging on less tasks)
         position_sampler_buffer_size: number of intervals picked up front by the
             position sampler. When all intervals are used, new intervals are picked.
@@ -73,7 +77,7 @@ class BigWigDataset:
         n_threads: number of python threads / cuda streams to use for loading the data to
             GPU. More threads means that more IO can take place while the GPU is busy doing
             calculations (decompressing or neural network training for example). More threads
-            also means a higher GPU memory usage.
+            also means a higher GPU memory usage. Default: 4
         return_batch_objects: if True, the batches will be returned as instances of
             bigwig_loader.batch.Batch
     """
@@ -92,11 +96,12 @@ def __init__(
         batches_per_epoch: Optional[int] = None,
         maximum_unknown_bases_fraction: float = 0.1,
         sequence_encoder: Optional[
-            Union[Callable[[Sequence[str]], Any], str]
+            Union[Callable[[Sequence[str]], Any], Literal["onehot"]]
         ] = "onehot",
         file_extensions: Sequence[str] = (".bigWig", ".bw"),
         crawl: bool = True,
         scale: Optional[dict[Union[str | Path], Any]] = None,
+        default_value: float = 0.0,
         first_n_files: Optional[int] = None,
         position_sampler_buffer_size: int = 100000,
         repeat_same_positions: bool = False,
@@ -139,6 +144,7 @@ def __init__(
         self._first_n_files = first_n_files
         self._file_extensions = file_extensions
         self._crawl = crawl
+        self._default_value = default_value
         self._scale = scale
         self._position_sampler_buffer_size = position_sampler_buffer_size
         self._repeat_same_positions = repeat_same_positions
@@ -181,6 +187,7 @@ def _create_dataloader(self) -> StreamedDataloader:
             queue_size=self._n_threads + 1,
             slice_size=self.batch_size,
             window_size=self.window_size,
+            default_value=self._default_value,
         )

     def __iter__(
diff --git a/bigwig_loader/intervals_to_values.py b/bigwig_loader/intervals_to_values.py
index 08c72e0..bc70ab6 100644
--- a/bigwig_loader/intervals_to_values.py
+++ b/bigwig_loader/intervals_to_values.py
@@ -8,8 +8,6 @@
 CUDA_KERNEL_DIR = Path(__file__).parent.parent / "cuda_kernels"

-_zero = cp.asarray(0.0, dtype=cp.float32).item()
-
 def get_cuda_kernel() -> str:
     with open(CUDA_KERNEL_DIR / "intervals_to_values.cu") as f:
@@ -31,6 +29,7 @@ def intervals_to_values(
     found_ends: cp.ndarray | None = None,
     sizes: cp.ndarray | None = None,
     window_size: int = 1,
+    default_value: float = 0.0,
     out: cp.ndarray | None = None,
 ) -> cp.ndarray:
     """
@@ -58,6 +57,8 @@ def intervals_to_values(
         sizes: number of elements in track_starts/track_ends/track_values for each
             track. Only needed when found_starts and found_ends are not given.
         window_size: size in basepairs to average over (default: 1)
+        default_value: value to use for regions where no data is specified (default: 0.0)
+        out: array of size n_tracks x batch_size x sequence_length to store the output.

     Returns:
         out: array of size n_tracks x batch_size x sequence_length
@@ -85,12 +86,15 @@ def intervals_to_values(
     )

     if out is None:
-        out = cp.zeros(
+        out = cp.full(
             (found_starts.shape[0], len(query_starts), sequence_length // window_size),
+            default_value,
             dtype=cp.float32,
         )
     else:
-        out *= _zero
+        logging.debug(f"Setting default value in output tensor to {default_value}")
+        out.fill(default_value)
+        logging.debug(out)

     max_number_intervals = min(
         sequence_length, (found_ends - found_starts).max().item()
@@ -136,7 +140,6 @@ def intervals_to_values(
             out,
         ),
     )
-
     return out
diff --git a/bigwig_loader/pytorch.py b/bigwig_loader/pytorch.py
index 7b68c49..474a1cd 100644
--- a/bigwig_loader/pytorch.py
+++ b/bigwig_loader/pytorch.py
@@ -124,8 +124,8 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
         reference_genome_path: path to fasta file containing the reference genome.
         sequence_length: number of base pairs in input sequence
         center_bin_to_predict: if given, only do prediction on a central window. Should be
-            smaller than or equal to sequence_length. If not given will be the same as
-            sequence_length.
+            smaller than or equal to sequence_length. If None, the whole sequence length
+            will be used. Default: None
         window_size: used to down sample the resolution of the target from sequence_length
         moving_average_window_size: window size for moving average on the target. Can help
             too smooth out the target. Default: 1, which means no smoothing. If
@@ -141,8 +141,21 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
         maximum_unknown_bases_fraction: maximum number of bases in an input sequence that
             is unknown.
         sequence_encoder: encoder to apply to the sequence. Default: bigwig_loader.util.onehot_sequences
-        position_samples_buffer_size: number of intervals picked up front by the position sampler.
+        file_extensions: load files with these extensions (default .bw and .bigWig)
+        crawl: whether to search in sub-directories for BigWig files
+        scale: Optional, dictionary with scaling factors for each BigWig file.
+            If None, no scaling is done. Keys can be (partial) file paths. See
+            bigwig_loader.path.match_key_to_path for more information about how
+            dict keys are mapped to paths.
+        default_value: value to use for intervals that are not present in the
+            bigwig file. Defaults to 0.0. Can be set to cp.nan to differentiate
+            between missing values and values explicitly listed as 0.0.
+        first_n_files: Only use the first n files (handy for debugging on less tasks)
+        position_sampler_buffer_size: number of intervals picked up front by the position sampler.
             When all intervals are used, new intervals are picked.
+        repeat_same_positions: if True the positions sampler does not draw a new random collection
+            of positions when the buffer runs out, but repeats the same samples. Can be used to
+            check whether the network can overfit.
         sub_sample_tracks: int, if set a different random set of tracks is selected in each
             superbatch from the total number of tracks. The indices corresponding to those tracks
             are returned in the output.
@@ -160,7 +173,7 @@ def __init__(
         collection: Union[str, Sequence[str], Path, Sequence[Path], BigWigCollection],
         reference_genome_path: Path,
         sequence_length: int = 1000,
-        center_bin_to_predict: Optional[int] = 200,
+        center_bin_to_predict: Optional[int] = None,
         window_size: int = 1,
         moving_average_window_size: int = 1,
         batch_size: int = 256,
@@ -172,6 +185,8 @@ def __init__(
         ] = "onehot",
         file_extensions: Sequence[str] = (".bigWig", ".bw"),
         crawl: bool = True,
+        scale: Optional[dict[Union[str | Path], Any]] = None,
+        default_value: float = 0.0,
         first_n_files: Optional[int] = None,
         position_sampler_buffer_size: int = 100000,
         repeat_same_positions: bool = False,
@@ -195,6 +210,8 @@ def __init__(
             sequence_encoder=sequence_encoder,
             file_extensions=file_extensions,
             crawl=crawl,
+            scale=scale,
+            default_value=default_value,
             first_n_files=first_n_files,
             position_sampler_buffer_size=position_sampler_buffer_size,
             repeat_same_positions=repeat_same_positions,
diff --git a/bigwig_loader/streamed_dataset.py b/bigwig_loader/streamed_dataset.py
index 4def95e..314058a 100644
--- a/bigwig_loader/streamed_dataset.py
+++ b/bigwig_loader/streamed_dataset.py
@@ -108,6 +108,7 @@ def __init__(
         queue_size: int = 10,
         slice_size: int | None = None,
         window_size: int = 1,
+        default_value: float = 0.0,
     ):
         self.input_generator = input_generator
         self.collection = collection
@@ -127,6 +128,7 @@ def __init__(
         self.data_generator_thread: threading.Thread | None = None
         self._entered = False
         self._out = None
+        self._default_value = default_value

     def __enter__(self) -> "StreamedDataloader":
         self._entered = True
@@ -214,6 +216,7 @@ def _generate_batches(self) -> Generator[Batch, None, None]:
                     found_starts=found_starts[:, select],
                     found_ends=found_ends[:, select],
                     window_size=self.window_size,
+                    default_value=self._default_value,
                     out=out,
                 )

@@ -248,7 +251,7 @@ def _get_out_tensor(
         shape = (number_of_tracks, batch_size, sequence_length)

         if self._out is None or self._out.shape != shape:
-            self._out = cp.zeros(shape, dtype=cp.float32)
+            self._out = cp.full(shape, self._default_value, dtype=cp.float32)
         return self._out

     def _determine_slice_size(self, n_samples: int) -> int:
diff --git a/tests/test_collection.py b/tests/test_collection.py
index 0be7002..1e9893e 100644
--- a/tests/test_collection.py
+++ b/tests/test_collection.py
@@ -57,6 +57,29 @@ def test_get_batch(collection):
     assert batch.shape == (256, n_files, 1000)


+def test_get_batch_with_nans(collection):
+    """
+    Testing whether NaNs are returned when setting
+    the default_value to cp.nan. Giving it some
+    coordinates way beyond the total chromosome
+    length now because the bigwigs I am testing
+    on include intervals with value 0.0 instead
+    of just not listing them.
+ """ + n_files = len(collection.bigwig_paths) + + batch = collection.get_batch( + ["chr1", "chr20", "chr4"], + [0, 99998999, 99998999], + [1000, 99999999, 99999999], + default_value=cp.nan, + ) + + print(batch) + assert batch.shape == (3, n_files, 1000) + assert cp.any(cp.isnan(batch)) + + def test_exclude_intervals(collection): intervals = collection.intervals( ["chr3", "chr4", "chr5"], exclude_chromosomes=["chr4"] diff --git a/tests/test_intervals_to_values.py b/tests/test_intervals_to_values.py index f2e70f5..9ef5215 100644 --- a/tests/test_intervals_to_values.py +++ b/tests/test_intervals_to_values.py @@ -230,3 +230,34 @@ def test_get_values_from_intervals_batch_multiple_tracks() -> None: print(expected) print(values) assert (values == expected).all() + + +def test_default_nan() -> None: + """Query end is exactly at end index before "gap" + Now instead of zeros, NaN values should be + used. + .""" + track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32) + track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32) + track_values = cp.asarray([20.0, 30.0, 40.0, 50.0], dtype=cp.dtype("f4")) + query_starts = cp.asarray([7, 9], dtype=cp.int32) + query_ends = cp.asarray([18, 20], dtype=cp.int32) + reserved = cp.zeros([2, 11], dtype=cp.dtype("