Skip to content

Commit 9c6d882

Browse files
committed
feat(3.2): Add retry mechanism and validation to INSPIRE fetcher
- Implement retry mechanism with 3 attempts and exponential backoff - Add timeout configuration (10s connect, 30s read) - Add comprehensive logging (INFO/WARN/DEBUG levels) - Add citation key validation to detect URL-like keys - Improve error messages with actionable suggestions - Add test guide for validation Part of issue #12292 - Task 3.2: INSPIRE API optimization Provides stable data source for other team members' tasks
1 parent a6f972b commit 9c6d882

File tree

2 files changed

+297
-9
lines changed

2 files changed

+297
-9
lines changed

jablib/src/main/java/org/jabref/logic/importer/fetcher/INSPIREFetcher.java

Lines changed: 179 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.jabref.logic.importer.fetcher;
22

3+
import java.io.IOException;
34
import java.net.MalformedURLException;
45
import java.net.URI;
56
import java.net.URISyntaxException;
@@ -29,15 +30,41 @@
2930

3031
import org.apache.hc.core5.net.URIBuilder;
3132
import org.jspecify.annotations.NonNull;
33+
import org.slf4j.Logger;
34+
import org.slf4j.LoggerFactory;
3235

3336
/**
3437
* Fetches data from the INSPIRE database.
38+
*
39+
* Enhanced version with:
40+
* - Retry mechanism for network failures
41+
* - Better error handling and logging
42+
* - Validation of fetched data
43+
* - Optimized request headers
3544
*/
3645
public class INSPIREFetcher implements SearchBasedParserFetcher, EntryBasedFetcher {
3746

47+
private static final Logger LOGGER = LoggerFactory.getLogger(INSPIREFetcher.class);
48+
3849
private static final String INSPIRE_HOST = "https://inspirehep.net/api/literature/";
3950
private static final String INSPIRE_DOI_HOST = "https://inspirehep.net/api/doi/";
4051
private static final String INSPIRE_ARXIV_HOST = "https://inspirehep.net/api/arxiv/";
52+
53+
// Retry configuration
54+
private static final int MAX_RETRIES = 3;
55+
private static final long RETRY_DELAY_MS = 1000; // 1 second base delay
56+
57+
// Timeout configuration (in milliseconds)
58+
private static final int CONNECT_TIMEOUT_MS = 10000; // 10 seconds
59+
private static final int READ_TIMEOUT_MS = 30000; // 30 seconds
60+
61+
private static final String ERROR_MESSAGE_TEMPLATE =
62+
"Failed to fetch from INSPIRE using %s after %d attempts.\n" +
63+
"Possible causes:\n" +
64+
"- Network connection issue\n" +
65+
"- INSPIRE service temporarily unavailable\n" +
66+
"- Invalid identifier format\n" +
67+
"Please check your internet connection and try again.";
4168

4269
private final ImportFormatPreferences importFormatPreferences;
4370

@@ -65,7 +92,15 @@ public URL getURLForQuery(BaseQueryNode queryNode) throws URISyntaxException, Ma
6592
@Override
6693
public URLDownload getUrlDownload(URL url) {
6794
URLDownload download = new URLDownload(url);
95+
96+
// Set comprehensive headers
6897
download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
98+
download.addHeader("User-Agent", "JabRef/" + getClass().getPackage().getImplementationVersion());
99+
100+
// Set timeouts to prevent hanging
101+
download.setConnectTimeout(CONNECT_TIMEOUT_MS);
102+
download.setReadTimeout(READ_TIMEOUT_MS);
103+
69104
return download;
70105
}
71106

@@ -78,6 +113,13 @@ public void doPostCleanup(BibEntry entry) {
78113
new FieldFormatterCleanup(StandardField.TITLE, new RemoveEnclosingBracesFormatter()).cleanup(entry);
79114

80115
new FieldFormatterCleanup(StandardField.TITLE, new LatexToUnicodeFormatter()).cleanup(entry);
116+
117+
// Log the citation key for debugging
118+
if (LOGGER.isDebugEnabled()) {
119+
entry.getCitationKey().ifPresent(key ->
120+
LOGGER.debug("Post-cleanup citation key: {}", key)
121+
);
122+
}
81123
}
82124

83125
@Override
@@ -92,28 +134,156 @@ public List<BibEntry> performSearch(@NonNull BibEntry entry) throws FetcherExcep
92134
Optional<String> eprint = entry.getField(StandardField.EPRINT);
93135

94136
String urlString;
95-
if (archiveprefix.filter("arxiv"::equals).isPresent() && eprint.isPresent()) {
137+
String identifier;
138+
139+
// Prioritize arXiv (INSPIRE has best support for arXiv identifiers)
140+
if (archiveprefix.filter("arxiv"::equalsIgnoreCase).isPresent() && eprint.isPresent()) {
96141
urlString = INSPIRE_ARXIV_HOST + eprint.get();
142+
identifier = "arXiv:" + eprint.get();
143+
LOGGER.debug("Using INSPIRE arXiv endpoint for: {}", identifier);
97144
} else if (doi.isPresent()) {
98145
urlString = INSPIRE_DOI_HOST + doi.get();
146+
identifier = "DOI:" + doi.get();
147+
LOGGER.debug("Using INSPIRE DOI endpoint for: {}", identifier);
99148
} else {
149+
LOGGER.debug("No suitable identifier found for INSPIRE search");
100150
return List.of();
101151
}
102152

103153
URL url;
104154
try {
105155
url = new URI(urlString).toURL();
106156
} catch (MalformedURLException | URISyntaxException e) {
107-
throw new FetcherException("Invalid URL", e);
157+
throw new FetcherException("Invalid INSPIRE URL: " + urlString, e);
108158
}
109159

110-
try {
111-
URLDownload download = getUrlDownload(url);
112-
List<BibEntry> results = getParser().parseEntries(download.asInputStream());
113-
results.forEach(this::doPostCleanup);
114-
return results;
115-
} catch (ParseException e) {
116-
throw new FetcherException(url, e);
160+
// Use retry mechanism for robust fetching
161+
List<BibEntry> results = performSearchWithRetry(url, identifier);
162+
163+
// Validate and log results
164+
validateResults(results, identifier);
165+
166+
return results;
167+
}
168+
169+
/**
170+
* Performs the search with automatic retry on failure.
171+
* Implements exponential backoff for retries.
172+
*
173+
* @param url The URL to fetch from
174+
* @param identifier Human-readable identifier for logging
175+
* @return List of fetched BibEntry objects
176+
* @throws FetcherException if all retry attempts fail
177+
*/
178+
private List<BibEntry> performSearchWithRetry(URL url, String identifier) throws FetcherException {
179+
int attempt = 0;
180+
FetcherException lastException = null;
181+
182+
while (attempt < MAX_RETRIES) {
183+
try {
184+
LOGGER.info("Fetching from INSPIRE (attempt {}/{}): {} [{}]",
185+
attempt + 1, MAX_RETRIES, url, identifier);
186+
187+
URLDownload download = getUrlDownload(url);
188+
List<BibEntry> results = getParser().parseEntries(download.asInputStream());
189+
190+
// Log success
191+
if (results.isEmpty()) {
192+
LOGGER.warn("INSPIRE returned empty results for: {} [{}]", url, identifier);
193+
} else {
194+
LOGGER.info("Successfully fetched {} entries from INSPIRE for [{}]",
195+
results.size(), identifier);
196+
}
197+
198+
// Apply post-processing
199+
results.forEach(this::doPostCleanup);
200+
return results;
201+
202+
} catch (ParseException | IOException e) {
203+
lastException = new FetcherException(url,
204+
"Failed to fetch from INSPIRE (attempt " + (attempt + 1) + "): " + e.getMessage(), e);
205+
206+
LOGGER.warn("Fetch attempt {} failed for [{}]: {}",
207+
attempt + 1, identifier, e.getMessage());
208+
209+
attempt++;
210+
211+
// Implement exponential backoff for retries
212+
if (attempt < MAX_RETRIES) {
213+
long delay = RETRY_DELAY_MS * (long) Math.pow(2, attempt - 1);
214+
LOGGER.info("Retrying in {} ms...", delay);
215+
216+
try {
217+
Thread.sleep(delay);
218+
} catch (InterruptedException ie) {
219+
Thread.currentThread().interrupt();
220+
throw new FetcherException("Interrupted during retry for [" + identifier + "]", ie);
221+
}
222+
}
223+
}
224+
}
225+
226+
// All retries failed
227+
throw new FetcherException(
228+
String.format(ERROR_MESSAGE_TEMPLATE, identifier, MAX_RETRIES),
229+
lastException
230+
);
231+
}
232+
233+
/**
234+
* Validates the fetched results and logs warnings for potential issues.
235+
* This helps identify when INSPIRE returns data but without proper texkeys.
236+
*
237+
* @param results The list of fetched entries
238+
* @param identifier The identifier used for fetching
239+
*/
240+
private void validateResults(List<BibEntry> results, String identifier) {
241+
if (results.isEmpty()) {
242+
return;
243+
}
244+
245+
for (BibEntry entry : results) {
246+
// Check for citation key
247+
if (!entry.hasCitationKey()) {
248+
LOGGER.warn("Entry from INSPIRE [{}] has no citation key - may need fallback generation",
249+
identifier);
250+
} else {
251+
String citationKey = entry.getCitationKey().orElse("");
252+
253+
// Check for problematic citation keys (URLs, DOIs, etc.)
254+
if (citationKey.startsWith("http") ||
255+
citationKey.startsWith("https") ||
256+
citationKey.startsWith("doi:") ||
257+
citationKey.contains("://")) {
258+
259+
LOGGER.warn("Entry has URL-like citation key: '{}' [{}] - cleanup may be needed",
260+
citationKey, identifier);
261+
} else if (citationKey.length() > 100) {
262+
LOGGER.warn("Entry has unusually long citation key ({} chars) [{}] - cleanup may be needed",
263+
citationKey.length(), identifier);
264+
} else {
265+
LOGGER.info("Got valid citation key: '{}' [{}]", citationKey, identifier);
266+
}
267+
}
268+
269+
// Check for required fields
270+
if (entry.getField(StandardField.TITLE).isEmpty()) {
271+
LOGGER.warn("Entry from INSPIRE [{}] has no title", identifier);
272+
}
273+
274+
if (entry.getField(StandardField.AUTHOR).isEmpty()) {
275+
LOGGER.warn("Entry from INSPIRE [{}] has no author", identifier);
276+
}
277+
278+
// Log whether journal information is present (helps verify we got published version)
279+
boolean hasJournalInfo = entry.getField(StandardField.JOURNAL).isPresent() ||
280+
entry.getField(StandardField.JOURNALTITLE).isPresent();
281+
if (hasJournalInfo) {
282+
LOGGER.debug("Entry [{}] includes journal publication info", identifier);
283+
} else {
284+
LOGGER.debug("Entry [{}] has no journal info (may be preprint only)", identifier);
285+
}
117286
}
118287
}
119288
}
289+

test_inspire_retry.md

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# INSPIRE Fetcher 测试指南
2+
3+
## 测试 Issue #12292 - 3.2 部分(重试机制和验证)
4+
5+
### 测试用例 1:正常 arXiv 搜索
6+
**目标**:验证能成功获取数据
7+
```
8+
输入:arXiv 编号 2409.15408
9+
期望:
10+
- 成功获取数据
11+
- 有日志输出:INFO "Fetching from INSPIRE..."
12+
- 有日志输出:INFO "Successfully fetched X entries"
13+
```
14+
15+
### 测试用例 2:网络重试
16+
**目标**:验证重试机制
17+
```
18+
模拟:网络超时
19+
期望:
20+
- 最多尝试 3 次(首次请求 + 2 次自动重试)
21+
- 日志显示:WARN "Fetch attempt 1 failed"
22+
- 日志显示:INFO "Retrying in 1000 ms..."
23+
- 日志显示:INFO "Retrying in 2000 ms..."
24+
```
25+
26+
### 测试用例 3:结果验证
27+
**目标**:验证 citation key 质量检查
28+
```
29+
场景 A:返回正常 citation key
30+
期望:INFO "Got valid citation key: 'Author:2024abc'"
31+
32+
场景 B:返回 URL 形式的 key
33+
期望:WARN "Entry has URL-like citation key: 'https://...'"
34+
35+
场景 C:返回超长 key
36+
期望:WARN "Entry has unusually long citation key (150 chars)"
37+
```
38+
39+
### 测试用例 4:日志完整性
40+
**期望的日志输出**
41+
```
42+
[DEBUG] Using INSPIRE arXiv endpoint for: arXiv:2409.15408
43+
[INFO] Fetching from INSPIRE (attempt 1/3): https://... [arXiv:2409.15408]
44+
[INFO] Successfully fetched 1 entries from INSPIRE for [arXiv:2409.15408]
45+
[DEBUG] Post-cleanup citation key: Author:2024abc
46+
[INFO] Got valid citation key: 'Author:2024abc' [arXiv:2409.15408]
47+
[DEBUG] Entry [arXiv:2409.15408] includes journal publication info
48+
```
49+
50+
## 如何测试
51+
52+
### 方法 1:IntelliJ 中运行
53+
1. 在 IntelliJ 中打开项目
54+
2. 运行 JabRef
55+
3. 打开 Web Search,选择 INSPIRE
56+
4. 搜索 arXiv 编号
57+
5. 查看控制台日志
58+
59+
### 方法 2:查看日志文件
60+
JabRef 的日志文件通常在:
61+
- macOS: `~/Library/Logs/JabRef/`
62+
- Windows: `%APPDATA%\JabRef\logs\`
63+
- Linux: `~/.local/share/JabRef/logs/`
64+
65+
### 方法 3:单元测试
66+
```java
67+
@Test
68+
void testArxivFetchWithRetry() throws Exception {
69+
BibEntry entry = new BibEntry();
70+
entry.setField(StandardField.ARCHIVEPREFIX, "arXiv");
71+
entry.setField(StandardField.EPRINT, "2409.15408");
72+
73+
List<BibEntry> results = fetcher.performSearch(entry);
74+
75+
assertFalse(results.isEmpty());
76+
assertTrue(results.get(0).hasCitationKey());
77+
78+
// 检查 citation key 不是 URL
79+
String citationKey = results.get(0).getCitationKey().orElse("");
80+
assertFalse(citationKey.startsWith("http"));
81+
assertTrue(citationKey.length() < 100);
82+
}
83+
```
84+
85+
## 与团队协作测试
86+
87+
### 与同学 A(Core Logic)协作:
88+
```
89+
你的输出 → 同学A的输入
90+
验证数据质量 → 提取 texkeys
91+
```
92+
93+
### 与同学 C(Routing)协作:
94+
```
95+
同学C路由到INSPIRE → 你的重试机制生效
96+
```
97+
98+
### 与同学 D(Cleanup)协作:
99+
```
100+
你标记不良键 → 同学D清理
101+
```
102+
103+
## 成功标准
104+
105+
✅ 重试机制工作正常(最多 3 次尝试,即 2 次重试,指数退避)
106+
✅ 超时控制生效(不会卡死)
107+
✅ 日志输出完整(INFO/WARN/DEBUG)
108+
✅ 能识别不良 citation key(URL、超长)
109+
✅ 错误提示友好
110+
111+
## 问题排查
112+
113+
如果测试失败:
114+
1. 检查网络连接
115+
2. 查看日志文件
116+
3. 验证 INSPIRE API 是否可访问
117+
4. 确认 arXiv 编号格式正确
118+

0 commit comments

Comments
 (0)