Merge pull request #95 from ArcInstitute/dev

abearab · web-flow · commit 7777e1b688b4 · 2024-09-21T21:23:46.000-07:00
minor fixes
diff --git a/docs/environment.yaml b/docs/environment.yaml
@@ -22,11 +22,13 @@ dependencies:
   - sphinx=5.3.0
   - sphinx_rtd_theme=1.1.1
   - sphinxcontrib-bibtex
+  - polars>0.20
   - pip
   - pip:
-      - polars
+      - pyarrow
       - biobear
       - numba
       - pydeseq2
       - simple_colors
+      - adjustText
       - watermark
diff --git a/environment.yml b/environment.yml
@@ -18,12 +18,13 @@ dependencies:
   - ipykernel
   - mscorefonts
   - rust>=1.72
+  - polars>0.20
   - pip
   - pip:
-      - polars
       - pyarrow
       - biobear
       - numba
       - pydeseq2
       - simple_colors
+      - adjustText
       - watermark
diff --git a/screenpro/__init__.py b/screenpro/__init__.py
@@ -31,6 +31,6 @@
 from .dashboard import DrugScreenDashboard
 
 
-__version__ = "0.4.13"
+__version__ = "0.4.14"
 __author__ = "Abe Arab"
 __email__ = 'abea@arcinstitute.org' # "abarbiology@gmail.com"
diff --git a/screenpro/phenoscore/_annotate.py b/screenpro/phenoscore/_annotate.py
@@ -21,15 +21,15 @@
 }
 
 
-def getCombinedScore(df, score_col='score', pvalue_col='pvalue', ctrl_label='negative_control'):
+def getCombinedScore(df_in, score_col='score', pvalue_col='pvalue', target_col='target', ctrl_label='negative_control'):
     """
     Calculate the combined score column based on the given phenotypic scores and p-values.
     Combined score is calculated as:
 
         $combined\_score = \frac{score}{pseudo\_sd} \times -\log_{10}(pvalue)$
 
     Parameters:
-        df (pandas.DataFrame): The input DataFrame.
+        df_in (pandas.DataFrame): The input DataFrame.
         score_col (str): The column name for the individual scores. Default is 'score'.
         pvalue_col (str): The column name for the p-values. Default is 'pvalue'.
         target_col (str): The column name for the target variable. Default is 'target'.
@@ -39,18 +39,22 @@ def getCombinedScore(df, score_col='score', pvalue_col='pvalue', ctrl_label='neg
     Returns:
         pandas.Series: The calculated combined score column.
     """
-    if 'target' not in df.columns:
-        raise ValueError('Column "target" not found in the input DataFrame.')
+    # make a copy of input dataframe
+    df = df_in.copy()
+
+    for col in [score_col, pvalue_col, target_col]:
+        if col not in df.columns:
+            raise ValueError(f'Column "{col}" not found in the input DataFrame.')
     
     # calculate pseudo_sd
-    pseudo_sd = df[df['target'].eq(ctrl_label)][score_col].tolist()
+    pseudo_sd = df[df[target_col].eq(ctrl_label)][score_col].tolist()
     pseudo_sd = np.std(pseudo_sd)
 
     # calculate combined score
     return df[score_col]/pseudo_sd * -np.log10(df[pvalue_col])
 
 
-def annotateScoreTable(df_in, up_hit, down_hit, threshold, score_col=None, pvalue_col=None, ctrl_label='negative_control'):
+def annotateScoreTable(df_in, up_hit, down_hit, threshold, score_col='score', pvalue_col='pvalue', target_col='target', ctrl_label='negative_control'):
     """
     Annotate the given score table
     
@@ -60,49 +64,47 @@ def annotateScoreTable(df_in, up_hit, down_hit, threshold, score_col=None, pvalu
         up_hit (str): up hit label
         down_hit (str): down hit label
         threshold (int): threshold value
-        score_col (str): score column name
-        pvalue_col (str): pvalue column name
-        ctrl_label (str): control label value
+        score_col (str): score column name. Default is 'score'.
+        target_col (str): column name for the target variable. Default is 'target'.
+        pvalue_col (str): pvalue column name. Default is 'pvalue'.
+        ctrl_label (str): control label value. Default is 'negative_control'.
     
     Returns:
         pd.DataFrame: annotated score dataframe
     """
-    if score_col is None: score_col = 'score'
-    if pvalue_col is None: pvalue_col = 'pvalue'
+    # make a copy of input dataframe
+    df = df_in.copy()
 
-    sel = ['target',score_col, pvalue_col]
-    
-    for col in sel:
-        if col not in df_in.columns:
+    for col in [score_col, pvalue_col, target_col]:
+        if col not in df.columns:
             raise ValueError(f'Column "{col}" not found in the input DataFrame.')
-    
-    # make a copy of input dataframe
-    df = df_in[sel].copy()
-    # # rename/reformat columns
-    # df.columns = ['target', 'score', 'pvalue']
+
     df[score_col] = df[score_col].astype(float)
     df[pvalue_col] = df[pvalue_col].astype(float)
 
     # add combined score column
-    df['combined_score'] = getCombinedScore(df, score_col, pvalue_col, ctrl_label)
+    df['combined_score'] = getCombinedScore(
+        df,
+        score_col=score_col, pvalue_col=pvalue_col, target_col=target_col,
+        ctrl_label=ctrl_label)
 
     # add label column
     df['label'] = '.'
 
     # annotate hits: up
     df.loc[
-        (df[score_col] > 0) & (~df['target'].eq(ctrl_label)) &
+        (df[score_col] > 0) & (~df[target_col].eq(ctrl_label)) &
         (df['combined_score'] >= threshold), 'label'
     ] = up_hit
 
     # annotate hits: down
     df.loc[
-        (df[score_col] < 0) & (~df['target'].eq(ctrl_label)) &
+        (df[score_col] < 0) & (~df[target_col].eq(ctrl_label)) &
         (df['combined_score'] <= -threshold), 'label'
     ] = down_hit
 
     # annotate control
-    df.loc[df['target'].eq(ctrl_label), 'label'] = ctrl_label
+    df.loc[df[target_col].eq(ctrl_label), 'label'] = ctrl_label
 
     # annotate non-hit
     df.loc[df['label'] == '.', 'label'] = 'target_non_hit'
diff --git a/screenpro/phenoscore/delta.py b/screenpro/phenoscore/delta.py
@@ -153,10 +153,10 @@ def compareByTargetGroup(adata, df_cond_ref, df_cond_test, keep_top_n, var_names
 
     # combine results into a dataframe
     result = pd.concat([
-        pd.Series(scores, name='score'),
-        pd.Series(p_values, name=f'{test} pvalue'),
-        pd.Series(adj_p_values, name='BH adj_pvalue'),
-        pd.Series(target_sizes, name='number_of_guide_elements'),
+        pd.Series(scores, name='score', dtype=float),
+        pd.Series(p_values, name=f'{test} pvalue', dtype=float),
+        pd.Series(adj_p_values, name='BH adj_pvalue', dtype=float),
+        pd.Series(target_sizes, name='number_of_guide_elements', dtype=int),
     ], axis=1)
 
     # add targets information
diff --git a/screenpro/plotting/_rank.py b/screenpro/plotting/_rank.py
@@ -1,5 +1,7 @@
 import pandas as pd
 import matplotlib.pyplot as plt
+
+from adjustText import adjust_text
 from ._utils import yellow_blue
 
 
@@ -57,8 +59,23 @@ def rank_plot(df, rank_col, color_col=None, name_col='target', highlight_values_
             ax.plot(highlight_ranks['Rank'], highlight_ranks[rank_col], 'o', color=highlight_color, markersize=dot_size * highlight_size_factor)
     
             if highlight_values['text'] is not False:
+                texts = []
                 for i, row in highlight_ranks.iterrows():
-                    ax.text(row['Rank'] + .01, row[rank_col] + .001, row[name_col], fontsize=txt_font_size, color=highlight_color, ha='right')
+                    t = ax.text(
+                        row['Rank'] + .01, 
+                        row[rank_col] + .001, 
+                        row[name_col], 
+                        fontsize=txt_font_size, 
+                        color=highlight_color, 
+                        ha='right'
+                    )
+                    texts.append(t)
+
+                adjust_text(
+                    texts, 
+                    arrowprops=dict(arrowstyle='-', color=highlight_color, lw=0.5),
+                    ax=ax
+                )
 
     # Add labels and title
     ax.set_xlabel(xlabel)