|
| 1 | +import logging |
| 2 | +import re |
| 3 | +from typing import Any, Dict |
| 4 | +from urllib.parse import urlparse |
| 5 | +from langchain_core.messages import HumanMessage |
| 6 | +from langchain_google_genai import ChatGoogleGenerativeAI |
1 | 7 |
|
| 8 | +from app.core.config import settings |
| 9 | +from app.database.weaviate.operations import search_contributors |
| 10 | +from app.services.github.issue_processor import GitHubIssueProcessor |
| 11 | +from app.services.embedding_service.service import EmbeddingService |
| 12 | +from ..prompts.contributor_recommendation.query_alignment import QUERY_ALIGNMENT_PROMPT |
| 13 | + |
| 14 | +logger = logging.getLogger(__name__) |
| 15 | + |
class ContributorRecommendationWorkflow:
    """
    Contributor recommendation with proper query alignment for hybrid search.

    The workflow asks an LLM to rewrite a free-form user request (optionally
    enriched with the content of a linked GitHub issue) into a structured
    alignment result, and exposes an embedding service for the caller to
    vectorise the aligned query.
    """

    # First GitHub issue URL found in the user's request, if any.
    _ISSUE_URL_PATTERN = re.compile(r'https?://github\.com/[\w-]+/[\w.-]+/issues/\d+')

    # Optional Markdown code fence (``` or ```json) wrapped around a reply.
    _CODE_FENCE_PATTERN = re.compile(r'^```[\w-]*\s*\n?(.*?)\n?\s*```$', re.DOTALL)

    def __init__(self):
        """Create the query-alignment LLM client and the embedding service."""
        self.query_alignment_llm = ChatGoogleGenerativeAI(
            model=settings.github_agent_model,
            temperature=0.1,
            google_api_key=settings.gemini_api_key
        )
        self.embedding_service = EmbeddingService()

    @staticmethod
    def _parse_alignment_payload(content: Any) -> Dict[str, Any]:
        """
        Decode the LLM reply into a JSON object.

        Args:
            content: The ``content`` attribute of the LLM response.

        Returns:
            The decoded JSON object, or an empty dict when the content is not
            a string, is not valid JSON, or decodes to something other than a
            JSON object. An empty dict therefore means "nothing usable".
        """
        # Local import: the module header does not import json.
        import json

        if not isinstance(content, str):
            return {}

        text = content.strip()

        # Chat models frequently wrap JSON in a Markdown fence; unwrap it so
        # an otherwise valid answer is not discarded.
        fenced = ContributorRecommendationWorkflow._CODE_FENCE_PATTERN.match(text)
        if fenced:
            text = fenced.group(1).strip()

        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            return {}

        # Callers use ``.get`` on the result, so only a JSON object is usable.
        return parsed if isinstance(parsed, dict) else {}

    async def _align_user_request(self, query: str) -> Dict[str, Any]:
        """
        Align user request into optimized format for hybrid search.
        Extract clean technical query + keywords that match contributor profiles.

        Args:
            query: Raw user request; may contain a GitHub issue URL.

        Returns:
            The alignment dict decoded from the LLM reply, or a fallback dict
            (``query_type``, ``aligned_query``, ``keywords``,
            ``technical_domain``) built from the raw query when the reply
            cannot be decoded into a JSON object.

        Raises:
            Exception: Whatever the issue fetch or the LLM call raises.
        """
        logger.info("Aligning user request for hybrid search optimization")

        url_match = self._ISSUE_URL_PATTERN.search(query)

        if url_match:
            issue_content = await self._fetch_github_issue_content(url_match.group(0))
            full_query = f"{query}\n\nIssue content: {issue_content}"
        else:
            full_query = query

        prompt = QUERY_ALIGNMENT_PROMPT.format(query=full_query)
        response = await self.query_alignment_llm.ainvoke([HumanMessage(content=prompt)])

        result = self._parse_alignment_payload(response.content)
        if not result:
            logger.warning("Failed to parse alignment result, using fallback")
            return {
                "query_type": "general",
                "aligned_query": query,
                "keywords": [],
                "technical_domain": "other"
            }

        logger.info(f"Query aligned: '{result.get('aligned_query')}' with keywords: {result.get('keywords')}")
        return result

    async def _fetch_github_issue_content(self, github_url: str) -> str:
        """
        Fetch GitHub issue content.

        Args:
            github_url: URL whose path has the form
                ``/<owner>/<repo>/issues/<number>``.

        Returns:
            Whatever ``GitHubIssueProcessor._fetch_issue_content`` returns
            for the addressed issue.

        Raises:
            ValueError: If the URL path is not an issue path or the issue
                number is not an integer.
            Exception: Any error raised by the processor; every failure is
                logged and re-raised.
        """
        try:
            parsed_url = urlparse(github_url)
            path_parts = parsed_url.path.strip('/').split('/')

            if len(path_parts) < 4 or path_parts[2] != "issues":
                raise ValueError("Invalid GitHub issue URL")

            owner, repo, issue_number = path_parts[0], path_parts[1], int(path_parts[3])
            processor = GitHubIssueProcessor(owner, repo, issue_number)

            return await processor._fetch_issue_content()

        except Exception as e:
            logger.error(f"GitHub issue fetching failed: {e}")
            raise
| 80 | + |
async def handle_contributor_recommendation(query: str) -> Dict[str, Any]:
    """
    Main entry point with unified query processing.

    Aligns the request through ``ContributorRecommendationWorkflow``, embeds
    the aligned text, runs ``search_contributors`` with the embedding and
    keywords, and formats the hits.

    Args:
        query: Raw user request.

    Returns:
        On success a dict with ``status`` "success", ``recommendations``,
        ``message``, ``search_query``, ``keywords_used`` and
        ``technical_domain`` (plus ``search_metadata`` when there are hits).
        On any exception a dict with ``status`` "error" and the exception
        text in ``message``; the exception is logged, not re-raised.
    """
    # Single source of truth so the weights reported in the metadata cannot
    # drift from the ones actually passed to the search.
    vector_weight = 0.7  # Semantic similarity
    bm25_weight = 0.3  # Keyword matching

    logger.info(f"Processing contributor recommendation: {query[:100]}...")

    try:
        workflow = ContributorRecommendationWorkflow()

        alignment_result = await workflow._align_user_request(query)

        # The alignment comes from LLM-produced JSON, so a key may be present
        # but null/empty; ``or`` falls back in that case as well as when the
        # key is missing.
        search_text = alignment_result.get("aligned_query") or query
        keywords = alignment_result.get("keywords") or []
        technical_domain = alignment_result.get("technical_domain") or "other"

        logger.info("Generating embedding for semantic search")
        enhanced_search_text = f"Looking for contributor with expertise in: {search_text}"
        query_embedding = await workflow.embedding_service.get_embedding(enhanced_search_text)
        logger.info(f"Generated embedding with dimension: {len(query_embedding)}")

        logger.info("Performing hybrid search (semantic + keyword matching)")

        results = await search_contributors(
            query_embedding=query_embedding,
            keywords=keywords,
            limit=5,
            vector_weight=vector_weight,
            bm25_weight=bm25_weight
        )

        logger.info(f"Search complete: Found {len(results)} potential contributors")

        if not results:
            logger.info("No contributors found matching the search criteria")
            return {
                "status": "success",
                "recommendations": [],
                "message": "No suitable contributors found",
                "search_query": search_text,
                "keywords_used": keywords,
                "technical_domain": technical_domain
            }

        logger.info("Formatting recommendations with scores")
        recommendations = []
        for contributor in results:
            # ``or`` also covers fields stored as None, which would otherwise
            # break ``join`` / ``round`` below.
            languages = contributor.get('languages') or []
            topics = contributor.get('topics') or []
            hybrid_score = contributor.get('hybrid_score') or 0
            vector_score = contributor.get('vector_score') or 0
            bm25_score = contributor.get('bm25_score') or 0

            reason_parts = []
            if languages:
                reason_parts.append(f"Expert in {', '.join(languages)}")
            if topics:
                reason_parts.append(f"Active in {', '.join(topics)}")

            username = contributor.get("github_username")
            recommendations.append({
                "user": username,
                "reason": " • ".join(reason_parts) if reason_parts else "Strong technical match",
                "search_score": round(hybrid_score, 4),
                "vector_score": round(vector_score, 4),
                "keyword_score": round(bm25_score, 4),
                "languages": languages,
                "topics": topics
            })
            logger.info(
                f"@{username} (score: {hybrid_score:.4f}) - {reason_parts[0] if reason_parts else 'Technical match'}")

        logger.info(f"Successfully generated {len(recommendations)} contributor recommendations")

        return {
            "status": "success",
            "recommendations": recommendations,
            "message": f"Found {len(recommendations)} suitable contributors",
            "search_query": search_text,
            "keywords_used": keywords,
            "technical_domain": technical_domain,
            "search_metadata": {
                "total_candidates": len(results),
                "vector_weight": vector_weight,
                "keyword_weight": bm25_weight,
                "embedding_dimension": len(query_embedding)
            }
        }

    except Exception as e:
        # Top-level boundary: log with traceback and report the failure as
        # data instead of propagating.
        logger.error(f"Error in contributor recommendation: {str(e)}", exc_info=True)
        return {"status": "error", "message": str(e)}
0 commit comments