11import json
2+ from datetime import datetime , timedelta , timezone
23from operator import itemgetter
34from typing import List , Optional
45
1011from langchain_core .documents import Document
1112from langchain_core .language_models .chat_models import BaseChatModel
1213from langchain_core .output_parsers import StrOutputParser
14+
15+ from oci .exceptions import TransientServiceError
1316from openai .types .chat import ChatCompletionMessageParam
1417
1518from skynet .constants import response_prefix
1619
17- from skynet .env import llama_n_ctx , modules , use_oci
20+ from skynet .env import llama_n_ctx , modules , oci_blackout_fallback_duration , use_oci
1821from skynet .logs import get_logger
1922from skynet .modules .monitoring import MAP_REDUCE_CHUNKING_COUNTER
2023from skynet .modules .ttt .assistant .constants import assistant_rag_question_extractor
4447
# Module-level logger, named after this module.
log = get_logger(__name__)
4649
# Global OCI blackout state management.
# Holds the timezone-aware (UTC) instant until which OCI is considered to be
# in blackout, or None when no blackout is in effect. Written by
# set_oci_blackout() and lazily reset to None by is_oci_blackout_active().
_oci_blackout_until: Optional[datetime] = None
52+
53+
def set_oci_blackout(duration_seconds: int) -> None:
    """Set OCI blackout for the specified duration.

    The blackout deadline is computed as the current UTC time plus
    ``duration_seconds`` and stored in the module-level
    ``_oci_blackout_until``. Any previously stored deadline is overwritten,
    whether it was earlier or later than the new one.

    Args:
        duration_seconds: Length of the blackout window in seconds. A
            non-positive value produces a deadline that is not in the
            future, so the next call to ``is_oci_blackout_active`` will
            treat the blackout as already expired.
    """
    global _oci_blackout_until
    # Use an aware UTC timestamp so the comparison in
    # is_oci_blackout_active() never mixes naive and aware datetimes.
    _oci_blackout_until = datetime.now(timezone.utc) + timedelta(seconds=duration_seconds)
    log.warning(f"OCI blackout set until {_oci_blackout_until} ({duration_seconds} seconds)")
59+
60+
def is_oci_blackout_active() -> bool:
    """Check if OCI is currently in blackout period.

    Side effect: when a stored deadline has been reached, the module-level
    ``_oci_blackout_until`` is reset to ``None`` and an info message is
    logged, so the expiry is reported once rather than on every call.

    Returns:
        True if a blackout deadline is stored and lies strictly in the
        future (UTC); False if no deadline is stored or it has been reached.
    """
    global _oci_blackout_until
    if _oci_blackout_until is None:
        return False

    now = datetime.now(timezone.utc)
    if now >= _oci_blackout_until:
        _oci_blackout_until = None  # Clear expired blackout
        log.info("OCI blackout period expired, resuming normal processing")
        return False

    return True
74+
4775
4876hint_type_to_prompt = {
4977 JobType .SUMMARY : {
@@ -201,7 +229,12 @@ async def process(job: Job) -> str:
201229 job_type = job .type
202230 customer_id = job .metadata .customer_id
203231
204- llm = LLMSelector .select (customer_id , job_id = job .id , ** {'max_completion_tokens' : payload .max_completion_tokens })
232+ llm = LLMSelector .select (
233+ customer_id ,
234+ job_id = job .id ,
235+ oci_blackout = is_oci_blackout_active (),
236+ ** {'max_completion_tokens' : payload .max_completion_tokens },
237+ )
205238
206239 try :
207240 if job_type == JobType .ASSIST :
@@ -212,6 +245,18 @@ async def process(job: Job) -> str:
212245 result = await process_text (llm , payload )
213246 else :
214247 raise ValueError (f'Invalid job type { job_type } ' )
248+ except TransientServiceError as e :
249+ log .warning (f"Job { job .id } hit TransientServiceError: { e } " )
250+
251+ # Set blackout using fallback duration
252+ blackout_duration = oci_blackout_fallback_duration
253+ log .info (f"TransientServiceError detected, setting { blackout_duration } s blackout" )
254+ set_oci_blackout (blackout_duration )
255+
256+ # Switch current job to local processing
257+ LLMSelector .override_job_processor (job .id , Processors .LOCAL )
258+ return await process (job )
259+
215260 except Exception as e :
216261 log .warning (f"Job { job .id } failed: { e } " )
217262
0 commit comments