11package org .jabref .logic .importer .fetcher ;
22
3+ import java .io .IOException ;
34import java .net .MalformedURLException ;
45import java .net .URI ;
56import java .net .URISyntaxException ;
2930
3031import org .apache .hc .core5 .net .URIBuilder ;
3132import org .jspecify .annotations .NonNull ;
33+ import org .slf4j .Logger ;
34+ import org .slf4j .LoggerFactory ;
3235
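/**
 * Fetches data from the INSPIRE database.
 * <p>
 * In addition to plain fetching, this fetcher provides:
 * <ul>
 *   <li>automatic retries with exponential backoff when an entry-based request fails</li>
 *   <li>more detailed error messages and logging</li>
 *   <li>sanity checks on fetched entries (suspicious citation keys, missing title or author),
 *       reported as log warnings</li>
 *   <li>explicit request headers (Accept, User-Agent) and connect/read timeouts</li>
 * </ul>
 */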
3645public class INSPIREFetcher implements SearchBasedParserFetcher , EntryBasedFetcher {
3746
47+ private static final Logger LOGGER = LoggerFactory .getLogger (INSPIREFetcher .class );
48+
3849 private static final String INSPIRE_HOST = "https://inspirehep.net/api/literature/" ;
3950 private static final String INSPIRE_DOI_HOST = "https://inspirehep.net/api/doi/" ;
4051 private static final String INSPIRE_ARXIV_HOST = "https://inspirehep.net/api/arxiv/" ;
52+
53+ // Retry configuration
54+ private static final int MAX_RETRIES = 3 ;
55+ private static final long RETRY_DELAY_MS = 1000 ; // 1 second base delay
56+
57+ // Timeout configuration (in milliseconds)
58+ private static final int CONNECT_TIMEOUT_MS = 10000 ; // 10 seconds
59+ private static final int READ_TIMEOUT_MS = 30000 ; // 30 seconds
60+
61+ private static final String ERROR_MESSAGE_TEMPLATE =
62+ "Failed to fetch from INSPIRE using %s after %d attempts.\n " +
63+ "Possible causes:\n " +
64+ "- Network connection issue\n " +
65+ "- INSPIRE service temporarily unavailable\n " +
66+ "- Invalid identifier format\n " +
67+ "Please check your internet connection and try again." ;
4168
4269 private final ImportFormatPreferences importFormatPreferences ;
4370
@@ -65,7 +92,15 @@ public URL getURLForQuery(BaseQueryNode queryNode) throws URISyntaxException, Ma
6592 @ Override
6693 public URLDownload getUrlDownload (URL url ) {
6794 URLDownload download = new URLDownload (url );
95+
96+ // Set comprehensive headers
6897 download .addHeader ("Accept" , MediaTypes .APPLICATION_BIBTEX );
98+ download .addHeader ("User-Agent" , "JabRef/" + getClass ().getPackage ().getImplementationVersion ());
99+
100+ // Set timeouts to prevent hanging
101+ download .setConnectTimeout (CONNECT_TIMEOUT_MS );
102+ download .setReadTimeout (READ_TIMEOUT_MS );
103+
69104 return download ;
70105 }
71106
@@ -78,6 +113,13 @@ public void doPostCleanup(BibEntry entry) {
78113 new FieldFormatterCleanup (StandardField .TITLE , new RemoveEnclosingBracesFormatter ()).cleanup (entry );
79114
80115 new FieldFormatterCleanup (StandardField .TITLE , new LatexToUnicodeFormatter ()).cleanup (entry );
116+
117+ // Log the citation key for debugging
118+ if (LOGGER .isDebugEnabled ()) {
119+ entry .getCitationKey ().ifPresent (key ->
120+ LOGGER .debug ("Post-cleanup citation key: {}" , key )
121+ );
122+ }
81123 }
82124
83125 @ Override
@@ -92,28 +134,156 @@ public List<BibEntry> performSearch(@NonNull BibEntry entry) throws FetcherExcep
92134 Optional <String > eprint = entry .getField (StandardField .EPRINT );
93135
94136 String urlString ;
95- if (archiveprefix .filter ("arxiv" ::equals ).isPresent () && eprint .isPresent ()) {
137+ String identifier ;
138+
139+ // Prioritize arXiv (INSPIRE has best support for arXiv identifiers)
140+ if (archiveprefix .filter ("arxiv" ::equalsIgnoreCase ).isPresent () && eprint .isPresent ()) {
96141 urlString = INSPIRE_ARXIV_HOST + eprint .get ();
142+ identifier = "arXiv:" + eprint .get ();
143+ LOGGER .debug ("Using INSPIRE arXiv endpoint for: {}" , identifier );
97144 } else if (doi .isPresent ()) {
98145 urlString = INSPIRE_DOI_HOST + doi .get ();
146+ identifier = "DOI:" + doi .get ();
147+ LOGGER .debug ("Using INSPIRE DOI endpoint for: {}" , identifier );
99148 } else {
149+ LOGGER .debug ("No suitable identifier found for INSPIRE search" );
100150 return List .of ();
101151 }
102152
103153 URL url ;
104154 try {
105155 url = new URI (urlString ).toURL ();
106156 } catch (MalformedURLException | URISyntaxException e ) {
107- throw new FetcherException ("Invalid URL" , e );
157+ throw new FetcherException ("Invalid INSPIRE URL: " + urlString , e );
108158 }
109159
110- try {
111- URLDownload download = getUrlDownload (url );
112- List <BibEntry > results = getParser ().parseEntries (download .asInputStream ());
113- results .forEach (this ::doPostCleanup );
114- return results ;
115- } catch (ParseException e ) {
116- throw new FetcherException (url , e );
160+ // Use retry mechanism for robust fetching
161+ List <BibEntry > results = performSearchWithRetry (url , identifier );
162+
163+ // Validate and log results
164+ validateResults (results , identifier );
165+
166+ return results ;
167+ }
168+
169+ /**
170+ * Performs the search with automatic retry on failure.
171+ * Implements exponential backoff for retries.
172+ *
173+ * @param url The URL to fetch from
174+ * @param identifier Human-readable identifier for logging
175+ * @return List of fetched BibEntry objects
176+ * @throws FetcherException if all retry attempts fail
177+ */
178+ private List <BibEntry > performSearchWithRetry (URL url , String identifier ) throws FetcherException {
179+ int attempt = 0 ;
180+ FetcherException lastException = null ;
181+
182+ while (attempt < MAX_RETRIES ) {
183+ try {
184+ LOGGER .info ("Fetching from INSPIRE (attempt {}/{}): {} [{}]" ,
185+ attempt + 1 , MAX_RETRIES , url , identifier );
186+
187+ URLDownload download = getUrlDownload (url );
188+ List <BibEntry > results = getParser ().parseEntries (download .asInputStream ());
189+
190+ // Log success
191+ if (results .isEmpty ()) {
192+ LOGGER .warn ("INSPIRE returned empty results for: {} [{}]" , url , identifier );
193+ } else {
194+ LOGGER .info ("Successfully fetched {} entries from INSPIRE for [{}]" ,
195+ results .size (), identifier );
196+ }
197+
198+ // Apply post-processing
199+ results .forEach (this ::doPostCleanup );
200+ return results ;
201+
202+ } catch (ParseException | IOException e ) {
203+ lastException = new FetcherException (url ,
204+ "Failed to fetch from INSPIRE (attempt " + (attempt + 1 ) + "): " + e .getMessage (), e );
205+
206+ LOGGER .warn ("Fetch attempt {} failed for [{}]: {}" ,
207+ attempt + 1 , identifier , e .getMessage ());
208+
209+ attempt ++;
210+
211+ // Implement exponential backoff for retries
212+ if (attempt < MAX_RETRIES ) {
213+ long delay = RETRY_DELAY_MS * (long ) Math .pow (2 , attempt - 1 );
214+ LOGGER .info ("Retrying in {} ms..." , delay );
215+
216+ try {
217+ Thread .sleep (delay );
218+ } catch (InterruptedException ie ) {
219+ Thread .currentThread ().interrupt ();
220+ throw new FetcherException ("Interrupted during retry for [" + identifier + "]" , ie );
221+ }
222+ }
223+ }
224+ }
225+
226+ // All retries failed
227+ throw new FetcherException (
228+ String .format (ERROR_MESSAGE_TEMPLATE , identifier , MAX_RETRIES ),
229+ lastException
230+ );
231+ }
232+
233+ /**
234+ * Validates the fetched results and logs warnings for potential issues.
235+ * This helps identify when INSPIRE returns data but without proper texkeys.
236+ *
237+ * @param results The list of fetched entries
238+ * @param identifier The identifier used for fetching
239+ */
240+ private void validateResults (List <BibEntry > results , String identifier ) {
241+ if (results .isEmpty ()) {
242+ return ;
243+ }
244+
245+ for (BibEntry entry : results ) {
246+ // Check for citation key
247+ if (!entry .hasCitationKey ()) {
248+ LOGGER .warn ("Entry from INSPIRE [{}] has no citation key - may need fallback generation" ,
249+ identifier );
250+ } else {
251+ String citationKey = entry .getCitationKey ().orElse ("" );
252+
253+ // Check for problematic citation keys (URLs, DOIs, etc.)
254+ if (citationKey .startsWith ("http" ) ||
255+ citationKey .startsWith ("https" ) ||
256+ citationKey .startsWith ("doi:" ) ||
257+ citationKey .contains ("://" )) {
258+
259+ LOGGER .warn ("Entry has URL-like citation key: '{}' [{}] - cleanup may be needed" ,
260+ citationKey , identifier );
261+ } else if (citationKey .length () > 100 ) {
262+ LOGGER .warn ("Entry has unusually long citation key ({} chars) [{}] - cleanup may be needed" ,
263+ citationKey .length (), identifier );
264+ } else {
265+ LOGGER .info ("Got valid citation key: '{}' [{}]" , citationKey , identifier );
266+ }
267+ }
268+
269+ // Check for required fields
270+ if (entry .getField (StandardField .TITLE ).isEmpty ()) {
271+ LOGGER .warn ("Entry from INSPIRE [{}] has no title" , identifier );
272+ }
273+
274+ if (entry .getField (StandardField .AUTHOR ).isEmpty ()) {
275+ LOGGER .warn ("Entry from INSPIRE [{}] has no author" , identifier );
276+ }
277+
278+ // Log whether journal information is present (helps verify we got published version)
279+ boolean hasJournalInfo = entry .getField (StandardField .JOURNAL ).isPresent () ||
280+ entry .getField (StandardField .JOURNALTITLE ).isPresent ();
281+ if (hasJournalInfo ) {
282+ LOGGER .debug ("Entry [{}] includes journal publication info" , identifier );
283+ } else {
284+ LOGGER .debug ("Entry [{}] has no journal info (may be preprint only)" , identifier );
285+ }
117286 }
118287 }
119288}
289+
0 commit comments