feat(3.2): Add retry mechanism and validation to INSPIRE fetcher

QiyuanHananu · QiyuanHananu · commit 9c6d882be2ce · 2025-10-09T13:42:08.000+11:00
- Implement retry mechanism with 3 attempts and exponential backoff
- Add timeout configuration (10s connect, 30s read)
- Add comprehensive logging (INFO/WARN/DEBUG levels)
- Add citation key validation to detect URL-like keys
- Improve error messages with actionable suggestions
- Add test guide for validation

Part of issue #12292 - Task 3.2: INSPIRE API optimization.
Provides a stable data source for other team members' tasks.
diff --git a/jablib/src/main/java/org/jabref/logic/importer/fetcher/INSPIREFetcher.java b/jablib/src/main/java/org/jabref/logic/importer/fetcher/INSPIREFetcher.java
@@ -1,5 +1,6 @@
 package org.jabref.logic.importer.fetcher;
 
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URISyntaxException;
@@ -29,15 +30,41 @@
 
 import org.apache.hc.core5.net.URIBuilder;
 import org.jspecify.annotations.NonNull;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Fetches data from the INSPIRE database.
+ * 
+ * Enhanced version with:
+ * - Retry mechanism for network failures
+ * - Better error handling and logging
+ * - Validation of fetched data
+ * - Optimized request headers
  */
 public class INSPIREFetcher implements SearchBasedParserFetcher, EntryBasedFetcher {
 
+    private static final Logger LOGGER = LoggerFactory.getLogger(INSPIREFetcher.class);
+    
     private static final String INSPIRE_HOST = "https://inspirehep.net/api/literature/";
     private static final String INSPIRE_DOI_HOST = "https://inspirehep.net/api/doi/";
     private static final String INSPIRE_ARXIV_HOST = "https://inspirehep.net/api/arxiv/";
+    
+    // Retry configuration: MAX_RETRIES is the total number of fetch attempts, the first one included
+    private static final int MAX_RETRIES = 3;
+    private static final long RETRY_DELAY_MS = 1000; // base delay (1 second); doubled after every further failed attempt
+    
+    // Timeout configuration (in milliseconds), applied to each download created by getUrlDownload
+    private static final int CONNECT_TIMEOUT_MS = 10000; // 10 seconds
+    private static final int READ_TIMEOUT_MS = 30000;    // 30 seconds
+    // Message of the exception thrown once all attempts have failed; %s = identifier, %d = number of attempts
+    private static final String ERROR_MESSAGE_TEMPLATE = 
+        "Failed to fetch from INSPIRE using %s after %d attempts.\n" +
+        "Possible causes:\n" +
+        "- Network connection issue\n" +
+        "- INSPIRE service temporarily unavailable\n" +
+        "- Invalid identifier format\n" +
+        "Please check your internet connection and try again.";
 
     private final ImportFormatPreferences importFormatPreferences;
 
@@ -65,7 +92,15 @@ public URL getURLForQuery(BaseQueryNode queryNode) throws URISyntaxException, Ma
     @Override
     public URLDownload getUrlDownload(URL url) {
         URLDownload download = new URLDownload(url);
+        
+        // Set comprehensive headers
         download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
+        download.addHeader("User-Agent", "JabRef/" + getClass().getPackage().getImplementationVersion());
+        
+        // Set timeouts to prevent hanging
+        download.setConnectTimeout(CONNECT_TIMEOUT_MS);
+        download.setReadTimeout(READ_TIMEOUT_MS);
+        
         return download;
     }
 
@@ -78,6 +113,13 @@ public void doPostCleanup(BibEntry entry) {
         new FieldFormatterCleanup(StandardField.TITLE, new RemoveEnclosingBracesFormatter()).cleanup(entry);
 
         new FieldFormatterCleanup(StandardField.TITLE, new LatexToUnicodeFormatter()).cleanup(entry);
+        
+        // Log the citation key for debugging
+        if (LOGGER.isDebugEnabled()) {
+            entry.getCitationKey().ifPresent(key -> 
+                LOGGER.debug("Post-cleanup citation key: {}", key)
+            );
+        }
     }
 
     @Override
@@ -92,28 +134,156 @@ public List<BibEntry> performSearch(@NonNull BibEntry entry) throws FetcherExcep
         Optional<String> eprint = entry.getField(StandardField.EPRINT);
 
         String urlString;
-        if (archiveprefix.filter("arxiv"::equals).isPresent() && eprint.isPresent()) {
+        String identifier;
+        
+        // Prioritize arXiv (INSPIRE has best support for arXiv identifiers)
+        if (archiveprefix.filter("arxiv"::equalsIgnoreCase).isPresent() && eprint.isPresent()) {
             urlString = INSPIRE_ARXIV_HOST + eprint.get();
+            identifier = "arXiv:" + eprint.get();
+            LOGGER.debug("Using INSPIRE arXiv endpoint for: {}", identifier);
         } else if (doi.isPresent()) {
             urlString = INSPIRE_DOI_HOST + doi.get();
+            identifier = "DOI:" + doi.get();
+            LOGGER.debug("Using INSPIRE DOI endpoint for: {}", identifier);
         } else {
+            LOGGER.debug("No suitable identifier found for INSPIRE search");
             return List.of();
         }
 
         URL url;
         try {
             url = new URI(urlString).toURL();
         } catch (MalformedURLException | URISyntaxException e) {
-            throw new FetcherException("Invalid URL", e);
+            throw new FetcherException("Invalid INSPIRE URL: " + urlString, e);
         }
 
-        try {
-            URLDownload download = getUrlDownload(url);
-            List<BibEntry> results = getParser().parseEntries(download.asInputStream());
-            results.forEach(this::doPostCleanup);
-            return results;
-        } catch (ParseException e) {
-            throw new FetcherException(url, e);
+        // Use retry mechanism for robust fetching
+        List<BibEntry> results = performSearchWithRetry(url, identifier);
+        
+        // Validate and log results
+        validateResults(results, identifier);
+        
+        return results;
+    }
+    
+    /**
+     * Downloads and parses the given INSPIRE URL, making up to {@code MAX_RETRIES} attempts.
+     * <p>
+     * After a failed attempt the method sleeps before trying again; the delay starts at
+     * {@code RETRY_DELAY_MS} and doubles with every further failure. All parsed entries
+     * are passed through {@link #doPostCleanup(BibEntry)} before they are returned.
+     *
+     * @param url        the URL to fetch from
+     * @param identifier human-readable identifier, used in log and error messages only
+     * @return the parsed and cleaned-up entries (possibly empty)
+     * @throws FetcherException if every attempt fails, or if the thread is interrupted while
+     *                          waiting for the next attempt
+     */
+    private List<BibEntry> performSearchWithRetry(URL url, String identifier) throws FetcherException {
+        FetcherException lastException = null;
+
+        // attempt is 1-based so it can be used directly in log and error messages
+        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
+            LOGGER.info("Fetching from INSPIRE (attempt {}/{}): {} [{}]",
+                    attempt, MAX_RETRIES, url, identifier);
+
+            // try-with-resources closes the response stream on success as well as on failure
+            try (var stream = getUrlDownload(url).asInputStream()) {
+                List<BibEntry> results = getParser().parseEntries(stream);
+
+                if (results.isEmpty()) {
+                    LOGGER.warn("INSPIRE returned empty results for: {} [{}]", url, identifier);
+                } else {
+                    LOGGER.info("Successfully fetched {} entries from INSPIRE for [{}]",
+                            results.size(), identifier);
+                }
+
+                results.forEach(this::doPostCleanup);
+                return results;
+            } catch (ParseException | IOException e) {
+                // Remember the failure so it can be attached as the cause of the final exception
+                lastException = new FetcherException(url,
+                        "Failed to fetch from INSPIRE (attempt " + attempt + "): " + e.getMessage(), e);
+                LOGGER.warn("Fetch attempt {} failed for [{}]: {}",
+                        attempt, identifier, e.getMessage());
+            }
+
+            // Exponential backoff, skipped after the last attempt: 1x, 2x, 4x, ... the base delay
+            if (attempt < MAX_RETRIES) {
+                long delay = RETRY_DELAY_MS << (attempt - 1);
+                LOGGER.info("Retrying in {} ms...", delay);
+
+                try {
+                    Thread.sleep(delay);
+                } catch (InterruptedException ie) {
+                    // Restore the interrupt flag so callers can still observe the interruption
+                    Thread.currentThread().interrupt();
+                    throw new FetcherException("Interrupted during retry for [" + identifier + "]", ie);
+                }
+            }
+        }
+
+        // All attempts failed
+        throw new FetcherException(
+                String.format(ERROR_MESSAGE_TEMPLATE, identifier, MAX_RETRIES),
+                lastException);
+    }
+    
+    
+    /**
+     * Checks the fetched entries for potential quality problems and logs what it finds.
+     * This method only reads the entries and writes log messages.
+     * <p>
+     * For every entry it reports a missing, URL-like or unusually long citation key,
+     * a missing title or author, and whether journal information is present.
+     *
+     * @param results    the list of fetched entries
+     * @param identifier the identifier used for fetching, included in each log message
+     */
+    private void validateResults(List<BibEntry> results, String identifier) {
+        for (BibEntry entry : results) {
+            // Check for citation key
+            if (!entry.hasCitationKey()) {
+                LOGGER.warn("Entry from INSPIRE [{}] has no citation key - may need fallback generation",
+                        identifier);
+            } else {
+                String citationKey = entry.getCitationKey().orElse("");
+
+                if (isUrlLikeKey(citationKey)) {
+                    LOGGER.warn("Entry has URL-like citation key: '{}' [{}] - cleanup may be needed",
+                            citationKey, identifier);
+                } else if (citationKey.length() > 100) {
+                    LOGGER.warn("Entry has unusually long citation key ({} chars) [{}] - cleanup may be needed",
+                            citationKey.length(), identifier);
+                } else {
+                    LOGGER.info("Got valid citation key: '{}' [{}]", citationKey, identifier);
+                }
+            }
+
+            // Check for required fields
+            if (entry.getField(StandardField.TITLE).isEmpty()) {
+                LOGGER.warn("Entry from INSPIRE [{}] has no title", identifier);
+            }
+
+            if (entry.getField(StandardField.AUTHOR).isEmpty()) {
+                LOGGER.warn("Entry from INSPIRE [{}] has no author", identifier);
+            }
+
+            // Log whether journal information is present (helps verify we got published version)
+            boolean hasJournalInfo = entry.getField(StandardField.JOURNAL).isPresent()
+                    || entry.getField(StandardField.JOURNALTITLE).isPresent();
+            if (hasJournalInfo) {
+                LOGGER.debug("Entry [{}] includes journal publication info", identifier);
+            } else {
+                LOGGER.debug("Entry [{}] has no journal info (may be preprint only)", identifier);
+            }
+        }
+    }
+
+    /**
+     * Tells whether a citation key looks like a URL or DOI reference rather than a real key.
+     * Prefixes are compared case-insensitively, so keys such as "HTTP..." or "DOI:..." are
+     * recognized as well.
+     *
+     * @param citationKey the citation key to inspect
+     * @return true if the key contains "://" or starts with "http" or "doi:"
+     */
+    private static boolean isUrlLikeKey(String citationKey) {
+        return citationKey.contains("://")
+                || citationKey.regionMatches(true, 0, "http", 0, 4)
+                || citationKey.regionMatches(true, 0, "doi:", 0, 4);
+    }
 }
+
diff --git a/test_inspire_retry.md b/test_inspire_retry.md
@@ -0,0 +1,118 @@
+# INSPIRE Fetcher 测试指南
+
+## 测试 Issue #12292 - 3.2 部分（重试机制和验证）
+
+### 测试用例 1：正常 arXiv 搜索
+**目标**：验证能成功获取数据
+```
+输入：arXiv 编号 2409.15408
+期望：
+- 成功获取数据
+- 有日志输出：INFO "Fetching from INSPIRE..."
+- 有日志输出：INFO "Successfully fetched X entries"
+```
+
+### 测试用例 2：网络重试
+**目标**：验证重试机制
+```
+模拟：网络超时
+期望：
+- 最多尝试 3 次（首次请求 + 2 次自动重试）
+- 日志显示：WARN "Fetch attempt 1 failed"
+- 日志显示：INFO "Retrying in 1000 ms..."
+- 日志显示：INFO "Retrying in 2000 ms..."
+```
+
+### 测试用例 3：结果验证
+**目标**：验证 citation key 质量检查
+```
+场景 A：返回正常 citation key
+期望：INFO "Got valid citation key: 'Author:2024abc'"
+
+场景 B：返回 URL 形式的 key
+期望：WARN "Entry has URL-like citation key: 'https://...'"
+
+场景 C：返回超长 key
+期望：WARN "Entry has unusually long citation key (150 chars)"
+```
+
+### 测试用例 4：日志完整性
+**期望的日志输出**：
+```
+[DEBUG] Using INSPIRE arXiv endpoint for: arXiv:2409.15408
+[INFO]  Fetching from INSPIRE (attempt 1/3): https://... [arXiv:2409.15408]
+[INFO]  Successfully fetched 1 entries from INSPIRE for [arXiv:2409.15408]
+[DEBUG] Post-cleanup citation key: Author:2024abc
+[INFO]  Got valid citation key: 'Author:2024abc' [arXiv:2409.15408]
+[DEBUG] Entry [arXiv:2409.15408] includes journal publication info
+```
+
+## 如何测试
+
+### 方法 1：IntelliJ 中运行
+1. 在 IntelliJ 中打开项目
+2. 运行 JabRef
+3. 打开 Web Search，选择 INSPIRE
+4. 搜索 arXiv 编号
+5. 查看控制台日志
+
+### 方法 2：查看日志文件
+JabRef 的日志文件通常在：
+- macOS: `~/Library/Logs/JabRef/`
+- Windows: `%APPDATA%\JabRef\logs\`
+- Linux: `~/.local/share/JabRef/logs/`
+
+### 方法 3：单元测试
+```java
+@Test
+void testArxivFetchWithRetry() throws Exception {
+    BibEntry entry = new BibEntry();
+    entry.setField(StandardField.ARCHIVEPREFIX, "arXiv");
+    entry.setField(StandardField.EPRINT, "2409.15408");
+    
+    List<BibEntry> results = fetcher.performSearch(entry);
+    
+    assertFalse(results.isEmpty());
+    assertTrue(results.get(0).hasCitationKey());
+    
+    // The citation key must not be a URL
+    String citationKey = results.get(0).getCitationKey().orElse("");
+    assertFalse(citationKey.startsWith("http"));
+    // Same boundary as the fetcher's validation, which only warns for keys longer than 100 characters
+    assertTrue(citationKey.length() <= 100);
+}
+```
+
+## 与团队协作测试
+
+### 与同学 A（Core Logic）协作：
+```
+你的输出 → 同学A的输入
+验证数据质量 → 提取 texkeys
+```
+
+### 与同学 C（Routing）协作：
+```
+同学C路由到INSPIRE → 你的重试机制生效
+```
+
+### 与同学 D（Cleanup）协作：
+```
+你标记不良键 → 同学D清理
+```
+
+## 成功标准
+
+✅ 重试机制工作正常（最多 3 次尝试，指数退避）
+✅ 超时控制生效（不会卡死）
+✅ 日志输出完整（INFO/WARN/DEBUG）
+✅ 能识别不良 citation key（URL、超长）
+✅ 错误提示友好
+
+## 问题排查
+
+如果测试失败：
+1. 检查网络连接
+2. 查看日志文件
+3. 验证 INSPIRE API 是否可访问
+4. 确认 arXiv 编号格式正确
+