11from aw_core import Event
2- from typing import Literal
2+ from typing import Literal , TypeAlias
3+ from datetime import date , datetime , timedelta , timezone
34
4- from .heartrate import load_heartrate_daily_df
5- from .screentime import load_category_df
5+ import pandas as pd
66
7- Sources = Literal ["activitywatch" , "heartrate" ]
7+ from ..load .location import load_daily_df as load_location_daily_df
8+ from ..load .qslang import load_daily_df as load_drugs_df
9+
10+ from .heartrate import load_heartrate_summary_df
11+ from .screentime import load_screentime_cached , load_category_df
12+ from .sleep import load_sleep_df
13+
# Names of the data sources that load_all_df knows how to load (and that can
# therefore be listed in its `ignore` argument). "sleep" is included because
# load_all_df checks for it alongside the other four.
Sources = Literal["screentime", "heartrate", "drugs", "location", "sleep"]
15+
16+
def load_all_df(
    fast=True,
    screentime_events: list[Event] | None = None,
    ignore: list[Sources] | None = None,
) -> pd.DataFrame:
    """
    Loads a bunch of data into a single dataframe with one row per day.
    Serves as a useful starting point for further analysis.

    Args:
        fast: When true, screentime is only loaded for the last 30 days;
            otherwise for the last two years. The flag is also forwarded to
            the screentime loader.
        screentime_events: Pre-loaded screentime events. When None, they are
            loaded via ``load_screentime_cached``.
        ignore: Names of sources to skip. None (the default) skips nothing.

    Returns:
        The joined dataframe, with columns that contain only NA values dropped.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    ignore = [] if ignore is None else ignore
    df = pd.DataFrame()

    if "screentime" not in ignore:
        print("Adding screentime")
        if screentime_events is None:
            # The lookback window is only needed when we load events ourselves.
            since = datetime.now(tz=timezone.utc) - timedelta(
                days=30 if fast else 2 * 365
            )
            screentime_events = load_screentime_cached(fast=fast, since=since)
        df_time = load_category_df(screentime_events)
        df_time = df_time[["Work", "Media", "ActivityWatch"]]
        df = join(df, df_time.add_prefix("time:"))

    if "heartrate" not in ignore:
        print("Adding heartrate")
        df_hr = load_heartrate_summary_df(freq="D")
        # translate daily datetime column to a date column
        df_hr.index = df_hr.index.date  # type: ignore
        df = join(df, df_hr)

    if "drugs" not in ignore:
        print("Adding drugs")
        # keep only columns starting with "tag"
        df_drugs = load_drugs_df()
        df_drugs = df_drugs[df_drugs.columns[df_drugs.columns.str.startswith("tag")]]
        df = join(df, df_drugs)

    if "location" not in ignore:
        print("Adding location")
        # TODO: add boolean for if sleeping together
        df_location = load_location_daily_df()
        df_location.index = df_location.index.date  # type: ignore
        df = join(df, df_location.add_prefix("loc:"))

    if "sleep" not in ignore:
        # Announce this source like the others so progress output is uniform.
        print("Adding sleep")
        df_sleep = load_sleep_df()
        df = join(df, df_sleep.add_prefix("sleep:"))

    # look for all-na columns, emit a warning, and drop them
    na_cols = df.columns[df.isna().all()]
    if len(na_cols) > 0:
        print(f"Warning: dropping all-NA columns: {str(list(na_cols))}")
        df = df.drop(columns=na_cols)

    return df
67+
68+
69+
def join(df_target: pd.DataFrame, df_source: pd.DataFrame) -> pd.DataFrame:
    """
    Join the columns of ``df_source`` onto ``df_target`` by index.

    If ``df_target`` is empty, ``df_source`` is returned as-is. Otherwise the
    date coverage of the source is checked against the target (warnings are
    printed, nothing is raised) and a ``DataFrame.join`` is performed, so the
    rows of the result are those of ``df_target``.

    Args:
        df_target: The dataframe accumulated so far.
        df_source: The dataframe whose columns should be added.

    Returns:
        The joined dataframe, or ``df_source`` itself when the target is empty.
    """
    target_is_empty = df_target.empty

    if not target_is_empty:
        if df_source.empty:
            # An empty source has no min/max index to compare, so the range
            # check would fail; warn instead and let the join proceed.
            print("Warning: source data is empty")
        else:
            check_new_data_in_range(df_source, df_target)

    new_columns = df_source.columns.difference(df_target.columns)
    print(f"Adding new columns: {str(list(new_columns))}")

    if target_is_empty:
        return df_source
    return df_target.join(df_source)
75+
76+
77+ DateLike : TypeAlias = datetime | date | pd .Timestamp
78+
79+
80+ def datelike_to_date (d : DateLike ) -> date :
81+ if isinstance (d , datetime ) or isinstance (d , pd .Timestamp ):
82+ return d .date ()
83+ elif isinstance (d , date ):
84+ return d
85+ else :
86+ raise ValueError (f"Invalid type for datelike: { type (d )} " )
87+
88+
def check_new_data_in_range(df_source: pd.DataFrame, df_target: pd.DataFrame) -> None:
    """
    Print a warning when the source index does not span the target index.

    The min/max of both indexes are converted to dates and compared. Nothing
    is raised for coverage problems; they are only reported on stdout.

    Args:
        df_source: The dataframe about to be joined in.
        df_target: The dataframe being joined onto.
    """
    # Empty indexes have no min/max to compare: an empty target needs no
    # coverage, and an empty source cannot provide any.
    if df_target.index.empty:
        return
    if df_source.index.empty:
        print("Warning: source data is empty, cannot cover target data")
        return

    # check that source data covers target data, or emit warning
    source_start = datelike_to_date(df_source.index.min())
    source_end = datelike_to_date(df_source.index.max())
    target_start = datelike_to_date(df_target.index.min())
    target_end = datelike_to_date(df_target.index.max())

    # check the worst case
    if source_start > target_end or source_end < target_start:
        print(
            f"Warning: source data does not cover ANY of target data: ({source_start}/{source_end}) not in ({target_start}/{target_end})"
        )
        return

    # The two partial-coverage cases are independent, so report each one
    # rather than stopping at the first.
    if source_start > target_start:
        print(
            f"Warning: source data starts after target data (partial): {source_start} > {target_start}"
        )
    if source_end < target_end:
        print(
            f"Warning: source data ends before target data (partial): {source_end} < {target_end}"
        )
109+
110+
if __name__ == "__main__":
    import logging
    import os

    logging.basicConfig(level=logging.INFO)

    # Fast mode is on unless the FAST environment variable is set to
    # something other than "1".
    use_fast = os.environ.get("FAST", "1") == "1"

    # print a summary of all data
    df = load_all_df(fast=use_fast)
    print(df)
    print(df.describe())

    # check for missing data: count NA cells per column, keep affected columns
    na_per_column = df.isna().sum()
    na_per_column = na_per_column[na_per_column > 0]
    if not na_per_column.empty:
        print(f"Missing data for {len(na_per_column)} out of {len(df.columns)} columns")
        print(na_per_column)
    print("Total days: ", len(df))

    # keep days with full coverage
    df = df.dropna()
    print("Total days with full coverage: ", len(df))

    print("Final dataframe:")
    print(df)
0 commit comments