
Commit 95bc0f0

Release/2.56.2 (#648)
- Fix fallback when SSO is enabled and other auth methods are used
- Add multithreading support for BigQuery Storage
- Introduce in-memory mode for PyCarol
1 parent 291f3fd commit 95bc0f0

File tree

6 files changed (+272, -14 lines)


pycarol/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -79,3 +79,4 @@
 from .storage import Storage # noqa
 from .subscription import Subscription # noqa
 from .tasks import Tasks # noqa
+from .memory import Memory # noqa

pycarol/bigquery.py

Lines changed: 28 additions & 13 deletions
@@ -14,6 +14,8 @@
 from google.api_core import retry as retries
 from google.auth.transport.requests import Request
 
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
 try:
     import pandas
 except ImportError:
@@ -407,14 +409,15 @@ def _get_read_session(
         self,
         client: bigquery_storage.BigQueryReadClient,
         table_name: str,
-        columns_names: T.Optional[T.List[str]] = None,
+        column_names: T.Optional[T.List[str]] = None,
         row_restriction: T.Optional[str] = None,
         sample_percentage: T.Optional[float] = None,
+        max_stream_count: T.Optional[int] = 1,
     ) -> bigquery_storage_v1.types.ReadSession:
         read_options = None
-        if columns_names is not None:
+        if column_names is not None:
             read_options = types.ReadSession.TableReadOptions( # type:ignore # noqa:E501 pylint:disable=no-member
-                selected_fields=columns_names,
+                selected_fields=column_names,
                 row_restriction=row_restriction,
                 sample_percentage=sample_percentage,
             )
@@ -429,17 +432,18 @@ def _get_read_session(
         read_session = client.create_read_session(
             parent=parent,
             read_session=requested_session,
-            max_stream_count=1,
+            max_stream_count=max_stream_count,
         )
         return read_session
 
     def query(
         self,
         table_name: str,
-        columns_names: T.Optional[T.List[str]] = None,
+        column_names: T.Optional[T.List[str]] = None,
         return_dataframe: bool = True,
         row_restriction: T.Optional[str] = None,
         sample_percentage: T.Optional[float] = None,
+        max_stream_count: T.Optional[int] = 1
     ) -> T.Union["pandas.DataFrame", T.List[bigquery_storage_v1.reader.ReadRowsPage]]:
         """Read from BigQuery Storage API.
 
@@ -473,24 +477,35 @@ def query
         read_session = self._get_read_session(
             client,
             table_name,
-            columns_names,
+            column_names,
             row_restriction,
             sample_percentage,
+            max_stream_count
         )
 
-        stream = read_session.streams[0]
-        reader = client.read_rows(stream.name)
+        all_frames = []
+
+        def _read_stream(stream):
+            frames = []
+            reader = client.read_rows(stream.name)
+            for frame in reader.rows().pages:
+                frames.append(frame)
+            return frames
+
+        with ThreadPoolExecutor(max_workers=len(read_session.streams)) as executor:
+            futures = {executor.submit(_read_stream, s): s for s in read_session.streams}
 
-        frames = []
-        for frame in reader.rows().pages:
-            frames.append(frame)
+            for future in as_completed(futures):
+                df = future.result()
+                all_frames.extend(df)
 
         if return_dataframe is False:
-            return frames
+            return all_frames
 
         if "pandas" not in sys.modules and return_dataframe is True:
             raise exceptions.PandasNotFoundException
 
-        dataframe = pandas.concat([frame.to_dataframe() for frame in frames])
+        dataframe = pandas.concat([frame.to_dataframe() for frame in all_frames])
         dataframe = dataframe.reset_index(drop=True)
         return dataframe
+

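The multithreaded read path above fans a Storage API read session out over several streams and collects their pages concurrently. A minimal usage sketch, assuming the enclosing class is the Storage API reader that pycarol exposes as BQStorage and that credentials resolve from the environment; the table and column names are illustrative:

from pycarol import Carol
from pycarol.bigquery import BQStorage  # assumed location of the Storage API reader

carol = Carol()  # assumes tenant/app/credentials come from the usual environment variables
bq_storage = BQStorage(carol)

# max_stream_count > 1 asks BigQuery for several read streams, which the new
# ThreadPoolExecutor-based loop consumes in parallel before concatenating pages.
df = bq_storage.query(
    "my_staging_table",                        # illustrative table name
    column_names=["mdmId", "mdmLastUpdated"],  # note the renamed parameter
    row_restriction="mdmLastUpdated > '2024-01-01'",
    max_stream_count=4,
)

Passing return_dataframe=False still returns the raw ReadRowsPage list, now gathered from all streams.
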
pycarol/carol.py

Lines changed: 7 additions & 0 deletions
@@ -329,10 +329,17 @@ def call_api(
                     return {}
                 return json.loads(response.text)
 
+            if ('Single Sign On enabled' in response.text) and isinstance(self.auth, PwdAuth):
+                raise exceptions.InvalidToken(
+                    response.text
+                    + ". Please use PwdFluig or ApiKeyAuth to authenticate.")
+
             if (response.reason == "Unauthorized") and isinstance(self.auth, PwdAuth):
                 if response.json().get("possibleResponsibleField") in [
                     "password",
                     "userLogin",
+                    "userLogin/password",
+                    "mfaCode",
                 ]:
                     raise exceptions.InvalidToken(response.text)
                 self.auth.get_access_token()  # It will refresh token if Unauthorized

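The new check makes call_api fail fast with InvalidToken when the tenant reports "Single Sign On enabled" and the client is authenticating with PwdAuth, instead of falling through to the generic Unauthorized handling. A hedged sketch of reacting to that on the caller side; import paths follow pycarol's usual top-level exports, and the constructor arguments and endpoint are illustrative:

from pycarol import Carol, PwdAuth, ApiKeyAuth
from pycarol import exceptions

try:
    carol = Carol(domain="mytenant", app_name="myapp",
                  auth=PwdAuth("user@example.com", "password"))
    carol.call_api("v1/users/current")  # illustrative endpoint
except exceptions.InvalidToken:
    # SSO tenants reject password logins; retry with an API key instead
    # (connector_id is a placeholder for whatever the key was issued against).
    carol = Carol(domain="mytenant", app_name="myapp",
                  auth=ApiKeyAuth("my-connector-api-key"),
                  connector_id="0a1b2c3d")
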
pycarol/exceptions.py

Lines changed: 14 additions & 0 deletions
@@ -100,3 +100,17 @@ def __init__(self):
             " False."
         )
         super().__init__(msg)
+
+
+class TableNotFoundError(Exception):
+
+    """Custom exception for when trying to append to a table that doesn't exist."""
+
+    pass
+
+
+class InsertOperationError(Exception):
+
+    """Custom exception for when an insert operation fails."""
+
+    pass

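Both new exceptions back the Memory.append flow introduced in pycarol/memory.py below. A small sketch of catching them; the table contents are made up:

import pandas as pd
from pycarol import Memory, exceptions

memory = Memory()
rows = pd.DataFrame({"id": [1], "name": ["Alice"]})

try:
    memory.append("users", rows)  # "users" was never created with add()
except exceptions.TableNotFoundError:
    memory.add("users", rows)     # create the table; later appends succeed
except exceptions.InsertOperationError as err:
    print(f"Insert failed: {err}")
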
pycarol/memory.py

Lines changed: 220 additions & 0 deletions
@@ -0,0 +1,220 @@
+import duckdb
+import pandas as pd
+from typing import List, Optional, Union, Tuple
+from . import exceptions
+
+
+class Memory:
+    def __init__(self, dfs: Optional[List[Tuple[pd.DataFrame, str]]] = None, database_path: Optional[str] = None) -> None:
+        """
+        Initialize Memory class with optional database file.
+
+        Args:
+            dfs: List of (dataframe, table_name) tuples to cache
+            database_path: Path to database file. If None, uses in-memory database.
+        """
+        self.database_path = database_path
+        self._is_memory_mode = database_path is None
+
+        # Create read-write connection for data loading
+        if self._is_memory_mode:
+            self.conn = duckdb.connect(database=':memory:', read_only=False)
+        else:
+            self.conn = duckdb.connect(database=database_path, read_only=False)
+
+        if dfs:
+            self.cache_dataframes(dfs)
+
+    def cache_dataframes(self, dfs: List[Tuple[pd.DataFrame, str]]) -> None:
+        """Cache multiple dataframes to the database.
+
+        This method takes a list of (dataframe, table_name) tuples and adds each
+        dataframe to the database with the specified table name.
+
+        Args:
+            dfs: List of tuples containing (dataframe, table_name) pairs to cache.
+
+        Example:
+            >>> memory = Memory()
+            >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+            >>> df2 = pd.DataFrame({"x": [5, 6], "y": [7, 8]})
+            >>> memory.cache_dataframes([(df1, "table1"), (df2, "table2")])
+        """
+        for df, table_name in dfs:
+            self.add(table_name, df)
+
+    def add(self, table_name: str, df: pd.DataFrame) -> None:
+        """Add or replace a table in the database.
+
+        This method creates a new table or replaces an existing one with the
+        provided dataframe. If a table with the same name already exists, it
+        will be dropped and recreated with the new data.
+
+        Args:
+            table_name: Name of the table to create or replace.
+            df: Pandas DataFrame containing the data to store.
+
+        Example:
+            >>> memory = Memory()
+            >>> df = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
+            >>> memory.add("users", df)
+        """
+        self.conn.execute(f"DROP TABLE IF EXISTS {table_name}")
+        self.conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df")
+
+    def delete(self, table_name: str) -> None:
+        """Remove a table from the database.
+
+        This method removes a table from the database if the table exists.
+
+        Args:
+            table_name: Name of the table to remove.
+
+        Example:
+            >>> memory = Memory()
+            >>> df = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
+            >>> memory.add("users", df)
+            >>> memory.delete("users")
+        """
+        self.conn.execute(f"DROP TABLE IF EXISTS {table_name}")
+
+    def append(self, table_name: str, df: pd.DataFrame) -> None:
+        """Append data to an existing table in the database.
+
+        This method adds new data to an existing table. The incoming dataframe
+        will be automatically reordered to match the existing table's column
+        order to ensure compatibility. The table must already exist.
+
+        Args:
+            table_name: Name of the existing table to append data to.
+            df: Pandas DataFrame containing the data to append.
+
+        Raises:
+            exceptions.TableNotFoundError: If the specified table does not exist.
+            exceptions.InsertOperationError: If the insert operation fails.
+
+        Example:
+            >>> memory = Memory()
+            >>> # First create a table
+            >>> initial_df = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
+            >>> memory.add("users", initial_df)
+            >>> # Then append more data
+            >>> new_df = pd.DataFrame({"id": [3, 4], "name": ["Charlie", "Diana"]})
+            >>> memory.append("users", new_df)
+        """
+        # Check if table exists
+        try:
+            table_info = self.conn.execute(f"DESCRIBE {table_name}").fetchdf()
+        except Exception as e:
+            raise exceptions.TableNotFoundError(f"Table '{table_name}' does not exist on Memory. Error: {str(e)}")
+
+        existing_columns = table_info['column_name'].tolist()
+
+        # Reorder dataframe columns to match existing table
+        df_reordered = df[existing_columns]
+
+        # Attempt to insert data
+        try:
+            self.conn.execute(f"INSERT INTO {table_name} SELECT * FROM df_reordered")
+        except Exception as e:
+            raise exceptions.InsertOperationError(f"Failed to insert data into table '{table_name}' on Memory. Error: {str(e)}")
+
+    def query(self, query: str) -> pd.DataFrame:
+        """Execute a SELECT query on the database.
+
+        This method executes SQL SELECT queries on the database. Only SELECT
+        statements and Common Table Expressions (CTEs) are allowed for security.
+        All queries are validated to prevent SQL injection attacks.
+
+        Args:
+            query: SQL SELECT query string to execute.
+
+        Returns:
+            pd.DataFrame: Query results as a pandas DataFrame.
+
+        Raises:
+            ValueError: If the query is not a valid SELECT statement or contains
+                multiple statements with non-SELECT operations.
+
+        Example:
+            >>> memory = Memory()
+            >>> memory.add("users", df)
+            >>> result = memory.query("SELECT * FROM users WHERE id > 1")
+            >>> result = memory.query("WITH cte AS (SELECT * FROM users) SELECT * FROM cte")
+        """
+        if not self._is_select_query(query):
+            raise ValueError("Only SELECT queries are allowed. Other operations are not permitted.")
+        return self.conn.execute(query).fetchdf()
+
+    def _is_select_query(self, query: str) -> bool:
+        """Check if the query contains only SELECT statements, preventing SQL injection.
+
+        This private method validates that the provided query contains only SELECT
+        statements or Common Table Expressions (CTEs). It prevents SQL injection
+        by ensuring no other SQL operations (INSERT, UPDATE, DELETE, DROP, etc.)
+        are present in the query.
+
+        Args:
+            query: SQL query string to validate.
+
+        Returns:
+            bool: True if the query contains only SELECT/WITH statements,
+                False otherwise.
+
+        """
+        normalized_query = query.strip()
+
+        statements = [stmt.strip() for stmt in normalized_query.split(';') if stmt.strip()]
+
+        for statement in statements:
+            statement_upper = statement.upper()
+            if not statement_upper.startswith(('SELECT', 'WITH')):
+                return False
+
+        return len(statements) > 0
+
+    def is_memory_mode(self) -> bool:
+        """Check if the database is running in memory mode.
+
+        Returns:
+            bool: True if using in-memory database, False if using file database.
+        """
+        return self._is_memory_mode
+
+    def get_database_path(self) -> Optional[str]:
+        """Get the database file path.
+
+        Returns:
+            Optional[str]: Path to the database file if using file mode,
+                None if using memory mode.
+        """
+        return self.database_path
+
+    def close(self) -> None:
+        """Close database connections.
+
+        This method properly closes all database connections. It should be
+        called when the Memory instance is no longer needed to free up
+        resources.
+        """
+        if hasattr(self, 'conn'):
+            self.conn.close()
+
+    def __enter__(self) -> 'Memory':
+        """Context manager entry.
+
+        Returns:
+            Memory: The Memory instance for use in a with statement.
+        """
+        return self
+
+    def __exit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[object]) -> None:
+        """Context manager exit.
+
+        Automatically closes database connections when exiting the with block.
+
+        Args:
+            exc_type: Exception type if an exception occurred.
+            exc_val: Exception value if an exception occurred.
+            exc_tb: Exception traceback if an exception occurred.
+        """
+        self.close()

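Putting the new module together, a short end-to-end sketch of the in-memory mode using only the methods shown above; data and table names are illustrative:

import pandas as pd
from pycarol import Memory

users = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
orders = pd.DataFrame({"order_id": [10, 11, 12], "user_id": [1, 2, 2]})

# No database_path, so DuckDB runs fully in memory; pass a path to persist instead.
with Memory(dfs=[(users, "users"), (orders, "orders")]) as memory:
    summary = memory.query(
        """
        SELECT u.name, COUNT(*) AS n_orders
        FROM users u
        JOIN orders o ON o.user_id = u.id
        GROUP BY u.name
        """
    )
    print(summary)
# __exit__ closes the DuckDB connection automatically.

Only SELECT/WITH statements pass the _is_select_query guard; anything else raises ValueError.
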
setup.py

Lines changed: 2 additions & 1 deletion
@@ -24,7 +24,8 @@
     "urllib3",
     "pandas>=0.23.4,!=1.0.4",
     "numpy>=1.16.3",
-    "pip-system-certs"
+    "pip-system-certs",
+    "duckdb==1.4.0"
 ]
 
 dataframe_requires = [
