Merge pull request #92 from ArcInstitute/dev

abearab · web-flow · commit 1b47fb976752 · 2024-08-13T20:07:30.000-07:00
minor bug fixes
diff --git a/README.md b/README.md
@@ -185,6 +185,8 @@ The first step in analyzing CRISPR screens with deep sequencing readouts is to p
 
 </details>
 
+<br>
+
 ### Step 2: Phenotype calculation
 
 Once you have the counts, you can use ScreenPro2 `phenoscore` and `phenostats` modules to calculate the phenotype scores and statistics between screen arms.
@@ -298,6 +300,8 @@ Once you have the counts, you can use ScreenPro2 `phenoscore` and `phenostats` m
   Last but not least, ScreenPro2 runs faster than ScreenProcessing (thanks to [biobear](https://github.com/wheretrue/biobear)) for processing FASTQ files. -->
 
 
+<br>
+
 ### Step 3: Data visualization
 
 Once the phenotypes are calculated, you can extract and explore the results using the `.phenotypes` attribute of the `PooledScreens` object. Currently, there are very limited functionalities built-in to visualize the results, but we are working on adding more features to make it easier for users. However, you can easily extract the results and use other libraries like `seaborn` and `matplotlib` in Python or `ggplot2` in R to visualize the results.
diff --git a/screenpro/__init__.py b/screenpro/__init__.py
@@ -31,6 +31,6 @@
 from .dashboard import DrugScreenDashboard
 
 
-__version__ = "0.4.11"
+__version__ = "0.4.12"
 __author__ = "Abe Arab"
 __email__ = 'abea@arcinstitute.org' # "abarbiology@gmail.com"
diff --git a/screenpro/assays/__init__.py b/screenpro/assays/__init__.py
@@ -332,11 +332,14 @@ def buildPhenotypeData(self, run_name='auto',db_rate_col='pop_doubling', **kwarg
         untreated = self.phenotypes[run_name]['config']['untreated']
         treated = self.phenotypes[run_name]['config']['treated']
 
-        #TODO: fix `_calculateGrowthFactor` and `_getTreatmentDoublingRate`
-        growth_factor_table = self._calculateGrowthFactor(
-            untreated = untreated, treated = treated, 
-            db_rate_col = db_rate_col
-        )
+        if type(treated) != list: treated = [treated]
+
+        if db_rate_col:
+            #TODO: fix `_calculateGrowthFactor` and `_getTreatmentDoublingRate`
+            growth_factor_table = self._calculateGrowthFactor(
+                untreated = untreated, treated = treated, 
+                db_rate_col = db_rate_col
+            )
         
         pdata_list = []
 
@@ -345,9 +348,12 @@ def buildPhenotypeData(self, run_name='auto',db_rate_col='pop_doubling', **kwarg
             score_tag, comparison = phenotype_name.split(':')
             cond_test, cond_ref = comparison.split('_vs_')
 
-            growth_rate_reps=growth_factor_table.query(
-                f'score=="{score_tag}"'
-            ).set_index('replicate')['growth_factor'].to_dict()
+            if db_rate_col:
+                growth_rate_reps=growth_factor_table.query(
+                    f'score=="{score_tag}"'
+                ).set_index('replicate')['growth_factor'].to_dict()
+            else:
+                growth_rate_reps=None
             
             pdata = getPhenotypeData(
                 self.adata, score_tag=score_tag, 
@@ -385,7 +391,7 @@ def drawVolcano(
         
         score_tag, _ = phenotype_name.split(':')
 
-        df = self.phenotypes[run_name]['results'][phenotype_name]
+        df = self.phenotypes[run_name]['results'][phenotype_name].dropna()
 
         df = annotateScoreTable(
             df, 
diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py
@@ -15,6 +15,8 @@
 def compareByReplicates(adata, df_cond_ref, df_cond_test, var_names='target', test='ttest', ctrl_label='negative_control', growth_rate=1, filter_type='mean', filter_threshold=40):
     """Calculate phenotype score and p-values comparing `cond_test` vs `cond_ref`.
 
+    In this function, the phenotype calculation is done by comparing multiple replicates of `cond_test` vs `cond_ref`.
+
     Args:
         adata (AnnData): AnnData object
         df_cond_ref (pd.DataFrame): dataframe of condition reference
@@ -43,8 +45,8 @@ def compareByReplicates(adata, df_cond_ref, df_cond_test, var_names='target', te
     y = df_cond_test.to_numpy()
 
     # get control values
-    x_ctrl = df_cond_ref[adat.var.targetType.eq(ctrl_label)].to_numpy()
-    y_ctrl = df_cond_test[adat.var.targetType.eq(ctrl_label)].to_numpy()
+    x_ctrl = df_cond_ref[adat.var.targetType.eq(ctrl_label)].dropna().to_numpy()
+    y_ctrl = df_cond_test[adat.var.targetType.eq(ctrl_label)].dropna().to_numpy()
 
     # calculate phenotype scores
     scores = calculateDelta(
@@ -79,6 +81,27 @@ def compareByReplicates(adata, df_cond_ref, df_cond_test, var_names='target', te
 
 
 def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names='target', test='ttest', ctrl_label='negative_control', growth_rate=1, filter_type='mean', filter_threshold=40):
+    """Calculate phenotype score and p-values comparing `cond_test` vs `cond_ref`.
+
+    In this function, the phenotype calculation is done by comparing groups of
+    guide elements (e.g. sgRNAs) that target the same gene, or pseudogene groups (i.e.
+    subsampled groups of non-targeting control elements), between `cond_test` and `cond_ref`.
+
+    Args:
+        adata (AnnData): AnnData object
+        df_cond_ref (pd.DataFrame): dataframe of condition reference
+        df_cond_test (pd.DataFrame): dataframe of condition test
+        keep_top_n (int): number of top guide elements to keep
+        var_names (str): variable names to use as index in the result dataframe (default is 'target')
+        test (str): test to use for calculating p-value ('MW': Mann-Whitney U rank; 'ttest': t-test), default is 'ttest'
+        ctrl_label (str): control label, default is 'negative_control'
+        growth_rate (int): growth rate (default is 1)
+        filter_type (str): filter type to apply to low counts ('mean', 'both', 'either'), default is 'mean'
+        filter_threshold (int): filter threshold for low counts (default is 40)
+        
+    Returns:
+        pd.DataFrame: result dataframe
+    """
 
     adat = adata.copy()
 
diff --git a/screenpro/plotting/_rank.py b/screenpro/plotting/_rank.py
@@ -3,7 +3,7 @@
 from ._utils import yellow_blue
 
 
-def rank_plot(df, rank_col, color_col=None, name_col='target', highlight_values_dict=None, xlabel='Rank', ylabel='Values', title='Rank Plot', ax=None, dot_size=1.5, highlight_size_factor=100, **args):
+def rank_plot(df, rank_col, color_col=None, name_col='target', highlight_values_dict=None, xlabel='Rank', ylabel='Values', title='Rank Plot', ax=None, dot_size=1.5, highlight_size_factor=100, txt_font_size=8, **args):
     """
     Plot the ranks against their values with specified color.
 
@@ -22,6 +22,7 @@ def rank_plot(df, rank_col, color_col=None, name_col='target', highlight_values_
         ax (matplotlib.axes.Axes, optional): The axis object to plot on. If not provided, a new axis will be created.
         dot_size (float, optional): The size of the dots in the scatter plot. Default is 1.5.
         highlight_size_factor (int, optional): The size factor for the highlighted dots. Default is 100.
+        txt_font_size (int, optional): The font size for the text labels. Default is 8.
         **args: Additional keyword arguments to be passed to the scatter plot.
 
     Returns:
@@ -57,7 +58,7 @@ def rank_plot(df, rank_col, color_col=None, name_col='target', highlight_values_
     
             if highlight_values['text'] is not False:
                 for i, row in highlight_ranks.iterrows():
-                    ax.text(row['Rank'] + .01, row[rank_col] + .001, row[name_col], fontsize=8, color=highlight_color, ha='right')
+                    ax.text(row['Rank'] + .01, row[rank_col] + .001, row[name_col], fontsize=txt_font_size, color=highlight_color, ha='right')
 
     # Add labels and title
     ax.set_xlabel(xlabel)