
Commit 856e546

Merge pull request #634 from GraphScope/refactor_workflow
fix(graphy): Fix bug of retrieving more than one result from arxiv
2 parents f74ecd1 + cea25b5 commit 856e546

File tree

6 files changed: +172 −40 lines

python/graphy/apps/paper_reading/paper_navigate_edge.py

Lines changed: 4 additions & 4 deletions
@@ -167,7 +167,7 @@ def execute(
         link_queue = Queue()
         output_queue = Queue()

-        logger.info("================= START NAVIGATE ==============")
+        logger.warning(f"================= START NAVIGATE ==============")
         paper_data_id_list = []
         for paper in input:
             if not paper:
@@ -379,7 +379,7 @@ def download_worker(self, link_queue):
                 download_folder=self.paper_download_dir,
                 meta_folder=self.meta_folder_dir,
             )
-            download_list = arxiv_fetcher.download_paper(link, 1)
+            download_list = arxiv_fetcher.download_paper(link, 5)

             if len(download_list) == 0:
                 logger.info(f"PASS {link} to SCHOLAR FOR FURTHER SEARCH")
@@ -493,7 +493,7 @@ def download_worker(self, scholar_link_queue):
                 scholar_link_queue.task_done()
                 continue

-            logger.info(
+            logger.error(
                 f"-------------- SCHOLAR DOWNLOAD WORKER: {link} ------------------"
             )

@@ -525,7 +525,7 @@ def download_worker(self, scholar_link_queue):

             scholar_link_queue.task_done()

-            logger.info(
+            logger.error(
                 f"-------------- FINISH SCHOLAR WORKER: {link} ------------------"
             )

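The two substantive changes above are the bump from download_paper(link, 1) to download_paper(link, 5), so the arXiv fetcher may return several candidate matches per link, and the existing fallback that passes a link to the Scholar worker when arXiv returns nothing. A minimal sketch of that arXiv-first, Scholar-fallback handoff, with the worker loop simplified and the sentinel handling hypothetical:

from queue import Queue

def download_worker(link_queue: Queue, scholar_link_queue: Queue, arxiv_fetcher):
    while True:
        link = link_queue.get()
        if link is None:  # hypothetical sentinel marking end of work
            link_queue.task_done()
            break
        # Ask arXiv for up to 5 candidate matches (was 1 before this commit).
        download_list = arxiv_fetcher.download_paper(link, 5)
        if len(download_list) == 0:
            # Nothing usable from arXiv: hand the link to the Scholar worker.
            scholar_link_queue.put(link)
        link_queue.task_done()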
python/graphy/apps/paper_reading/paper_reading_nodes.py

Lines changed: 1 addition & 0 deletions
@@ -602,6 +602,7 @@ def execute(

             if not paper_file_path:
                 logger.error("No 'paper_file_path' provided in input data.")
+                logger.error(f"create fake extractor {paper_meta_path}")
                 if not paper_meta_path:
                     continue
                 try:

python/graphy/utils/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@
 from .paper_struct import Paper
 from .timer import Timer
 from .bib_search import BibSearchGoogleScholar, BibSearchArxiv, BibSearchPubMed
+from .string_similarity import StringSimilarity

 from .json_parser import (
     JsonParserType,
@@ -30,4 +31,5 @@
     "BibSearchGoogleScholar",
     "BibSearchArxiv",
     "BibSearchPubMed",
+    "StringSimilarity",
 ]

python/graphy/utils/arxiv_fetcher.py

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ def __init__(
         :param timeout: The maximum time (in seconds) allowed for each paper fetching operation.
         :param download_folder: The folder where the fetched papers will be downloaded.
         """
-        self.client = arxiv.Client(delay_seconds=0.2, page_size=3, num_retries=1)
+        self.client = arxiv.Client(delay_seconds=0.2, page_size=5, num_retries=1)
         self.timeout = timeout
         self.download_folder = download_folder
         self.bib_search_arxiv = BibSearchArxiv(

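In the arxiv package, page_size is the number of results the client requests per API page; raising it from 3 to 5 aligns one page with the up-to-five candidates that download_paper now asks for, presumably avoiding truncated result sets. A standalone sketch of these Client parameters (the query string is illustrative):

import arxiv

# Up to 5 results per API page, 0.2s between page requests, one retry.
client = arxiv.Client(delay_seconds=0.2, page_size=5, num_retries=1)
search = arxiv.Search(query="graph computing", max_results=5)

for result in client.results(search):
    print(result.title)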
python/graphy/utils/bib_search.py

Lines changed: 94 additions & 35 deletions
@@ -26,6 +26,7 @@
 import threading

 from google_scholar_py import CustomGoogleScholarOrganic
+from .string_similarity import StringSimilarity

 logger = logging.getLogger(__name__)

@@ -71,7 +72,10 @@ def __init__(self, persist_store=None, web_data_folder="", meta_folder="") -> None

         self.web_data_folder = web_data_folder

-        self.request_interval = 5
+        self.request_interval = 10
+
+    def _formulate_query(self, query):
+        return re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())

     def safe_request(self, driver, link):
         with BibSearchGoogleScholar.google_scholar_request_lock:
@@ -80,13 +84,13 @@ def safe_request(self, driver, link):
             interval = time.time() - BibSearchGoogleScholar.last_request_google_scholar
             if interval < self.request_interval:
                 time_to_wait = (
-                    random.uniform(self.request_interval, self.request_interval + 5)
+                    random.uniform(self.request_interval, self.request_interval + 6)
                     - interval
                 )

                 time.sleep(time_to_wait)

-            logger.info(f"Time Issues: {time.time()} - {time_to_wait} {link}")
+            logger.warning(f"Time Issues: {time.time()} - {time_to_wait} {link}")

             driver.get(link)

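safe_request serializes all Scholar traffic behind a class-level lock and enforces a randomized minimum gap between requests; this commit widens that gap from 5s (+5s jitter) to 10s (+6s jitter). The throttling pattern in isolation, a sketch with module-level state standing in for the class attributes:

import random
import threading
import time

_lock = threading.Lock()
_last_request = 0.0
REQUEST_INTERVAL = 10  # seconds, the new value from the diff

def rate_limited_get(driver, link):
    # Serialize requests and keep a jittered minimum gap between them.
    global _last_request
    with _lock:
        elapsed = time.time() - _last_request
        if elapsed < REQUEST_INTERVAL:
            # Sleep a randomized 10-16s interval, minus time already elapsed,
            # mirroring random.uniform(interval, interval + 6) above.
            time.sleep(
                random.uniform(REQUEST_INTERVAL, REQUEST_INTERVAL + 6) - elapsed
            )
        driver.get(link)
        _last_request = time.time()
    return driver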
@@ -429,7 +433,8 @@ def _get_cited_by_paper_names(self, driver, link, max_results=50):
             if page_num == 0:
                 refined_link = f"{link}"
             else:
-                refined_link = f"{link_header}?start={str(page_num)}&{link_params['hl']}&{link_params['as_sdt']}&{link_params['sciodt']}&{link_params['cites']}&scipsc="
+                refined_link = f"{link_header}?start={str(page_num)}&{link_params['hl']}&{link_params['as_sdt']}&{link_params['sciodt']}&{link_params['cites']}"
+                # refined_link = f"{link_header}?start={str(page_num)}&{link_params['hl']}&{link_params['as_sdt']}&{link_params['sciodt']}&{link_params['cites']}&scipsc="

             driver = self.safe_request(
                 driver=driver,
@@ -451,8 +456,9 @@ def _get_cited_by_paper_names(self, driver, link, max_results=50):
             if (
                 "not a robot" in driver.page_source
                 or "may be sending automated queries" in driver.page_source
+                or "您的计算机网络中存在异常流量" in driver.page_source
             ):
-                logger.error("Detected as a spider")
+                logger.error("============== DETECTED AS A SPIDER ===============")
             parser = LexborHTMLParser(driver.page_source)

             if get_content:
@@ -496,9 +502,21 @@ def parse(
                 title = str(time.time())

             if mode == "exact":
-                similarity = difflib.SequenceMatcher(
-                    None, title.lower(), query.lower()
-                ).ratio()
+                if self._formulate_query(title).lower() in query.lower():
+                    similarity = 1
+                else:
+                    similarity = StringSimilarity.ratio_similarity(
+                        title.lower(), query.lower()
+                    )
+                    # similarity = StringSimilarity.semantic_similarity(
+                    #     title.lower(), query.lower()
+                    # )
+                    # similarity = difflib.SequenceMatcher(
+                    #     None, title.lower(), query.lower()
+                    # ).ratio()
+                    similarity = StringSimilarity.semantic_similarity(
+                        title.lower(), query.lower()
+                    )
                 logger.info(
                     f"Scholar compared with: {query}, Found paper: {title} with similarity {similarity}"
                 )
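string_similarity.py is added by this PR but its body is not visible in this view, so the following is only a plausible reconstruction of the two methods parse now calls, not the committed implementation. Note that in the hunk above, a second, uncommented semantic_similarity call follows the ratio_similarity assignment and overwrites it, so the ratio value appears to be dead.

import difflib

class StringSimilarity:
    @staticmethod
    def ratio_similarity(s1: str, s2: str) -> float:
        # Character-level ratio in [0, 1], as difflib computed before this change.
        return difflib.SequenceMatcher(None, s1, s2).ratio()

    @staticmethod
    def semantic_similarity(s1: str, s2: str) -> float:
        # Hypothetical stand-in: token-level Jaccard overlap. The committed
        # module may use embeddings or another metric entirely.
        t1, t2 = set(s1.split()), set(s2.split())
        return len(t1 & t2) / len(t1 | t2) if t1 | t2 else 0.0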
@@ -524,9 +542,11 @@ def parse(
                     outputs.append(this_bib)

                 elif action == "download":
+                    logger.error("start to download")
                     succ, file_path, file_name, exist = self.download(
                         driver, title, result, download_path
                     )
+                    logger.error("finish to download")

                     if not succ and not exist:
                         logger.warning(f"Found {title}, but download failed.")
@@ -555,7 +575,6 @@ def parse(
                         driver, title, result, cite_directory
                     )

-                    logger.error(f"already get bib: {this_bib}")
                     if (
                         "cited_by_link" in this_bib
                         and this_bib["cited_by_link"] is not None
@@ -564,7 +583,6 @@ def parse(
                         this_bib["cited_by"] = self._get_cited_by_paper_names(
                             driver, this_bib["cited_by_link"]
                         )
-                    logger.error(f"finish use this bib")
                 except Exception as e:
                     if this_bib is None:
                         meta_file_path = None
@@ -574,10 +592,14 @@ def parse(
                     self.persist_store.save_state(
                         self.meta_folder, file_name, this_bib
                     )
-                if meta_file_path:
+                if succ and meta_file_path:
                     outputs.append((True, file_path, meta_file_path, exist))
-                else:
-                    outputs.append((succ, file_path, meta_file_path, exist))
+                elif succ and not meta_file_path:
+                    outputs.append((True, file_path, meta_file_path, exist))
+                elif not succ and meta_file_path:
+                    outputs.append((True, None, meta_file_path, exist))
+                elif not succ and not meta_file_path:
+                    outputs.append((False, None, meta_file_path, exist))

                 if len(outputs) >= num_per_page:
                     break
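The four-way branch above enumerates every (succ, meta_file_path) combination, but both succ branches append the identical tuple, so it collapses to three cases. An equivalent compact form, a sketch preserving the committed behavior exactly:

if succ:
    outputs.append((True, file_path, meta_file_path, exist))
elif meta_file_path:
    # Download failed but a meta file exists: still report success, no file.
    outputs.append((True, None, meta_file_path, exist))
else:
    outputs.append((False, None, meta_file_path, exist))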
@@ -721,7 +743,7 @@ def search_by_name(

         page_num = 0
         organic_results_data = []
-        pruned_query = re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
+        pruned_query = self._formulate_query(query)
         logger.info(f"pruned query {pruned_query}")

         # parse all pages
@@ -800,7 +822,8 @@ def download(self, driver, title, result, download_path):
             pdf_link: str = result.css_first(".gs_or_ggsm a").attrs["href"]

             headers = {
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+                "referer": "https://scholar.google.com/",
             }
             response = requests.get(pdf_link, headers=headers)

@@ -811,18 +834,19 @@ def download(self, driver, title, result, download_path):
                 return True, file_path, scholar_name, False
             else:
                 logger.warning(
-                    f"Failed to download. Status code: {response.status_code}"
+                    f"Failed to download. Status code: {response.status_code}. Try to fix ..."
                 )
+
                 with open("fail_log.log", "a") as f:
                     f.write(file_path + "\n")
-                    f.write(pdf_link + "\n")
+                    # f.write(pdf_link + "\n")
                     f.write("STATUS CODE: " + str(response.status_code) + "\n")
                     f.write("\n")
         except Exception as e:
             logger.error(f"Download failed: {e}")
             with open("fail_log.log", "a") as f:
                 f.write(file_path + "\n")
-                f.write(pdf_link + "\n")
+                # f.write(pdf_link + "\n")
                 f.write(str(e) + "\n")
                 f.write("\n")

@@ -837,9 +861,18 @@ def condition_met(driver):
             except Exception:
                 element_present = False

+            if not element_present:
+                try:
+                    element_present = EC.presence_of_element_located(
+                        (By.ID, "gs_res_ccl_mid")
+                    )(driver)
+                except Exception:
+                    element_present = False
+
             text_present = (
                 "not a robot" in driver.page_source
                 or "may be sending automated queries" in driver.page_source
+                or "您的计算机网络中存在异常流量" in driver.page_source
             )

             return element_present or text_present
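condition_met now treats the page as loaded when either of two result containers is present, or when any of three block/CAPTCHA messages appears (the Chinese string is Google's "unusual traffic from your computer network" notice), so the caller can react to a block instead of timing out. A standalone sketch of the added fallback check; only the gs_res_ccl_mid locator is visible in this hunk, and the first locator the committed code tries lies outside the diff context:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

def results_or_block_present(driver):
    # Fallback container check added by this commit.
    try:
        element_present = bool(
            EC.presence_of_element_located((By.ID, "gs_res_ccl_mid"))(driver)
        )
    except Exception:
        element_present = False

    # A block page also counts as "finished", so the caller can handle it.
    text_present = (
        "not a robot" in driver.page_source
        or "may be sending automated queries" in driver.page_source
        or "您的计算机网络中存在异常流量" in driver.page_source
    )
    return element_present or text_present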
@@ -876,7 +909,7 @@ def download_by_name(
             if pagination:
                 while page_num <= 10:
                     # parse all pages (the first two pages)
-                    pruned_query = re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
+                    pruned_query = self._formulate_query(query)
                     driver = self.safe_request(
                         driver=driver,
                         link=f"https://scholar.google.com/scholar?q={pruned_query}&hl=en&gl=us&start={page_num}",
@@ -911,27 +944,53 @@ def download_by_name(
             else:
                 # parse first page only
                 # logger.error("### START TO DOWNLOAD #####")
-                pruned_query = re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
+                pruned_query = self._formulate_query(query)
                 # logger.error(pruned_query)

-                driver = self.safe_request(
-                    driver=driver,
-                    link=f"https://scholar.google.com/scholar?q={pruned_query}&hl=en&gl=us&start={page_num}",
-                )
-
-                WebDriverWait(driver, 10).until(self.finish_load_condition())
+                retry_times = 0
+                max_retry_times = 3

-                parser = LexborHTMLParser(driver.page_source)
+                while retry_times <= max_retry_times:
+                    retry_times += 1
+                    driver = self.safe_request(
+                        driver=driver,
+                        link=f"https://scholar.google.com/scholar?q={pruned_query}&hl=zh-cn&gl=us&as_sdt=0,5&start={page_num}",
+                    )

-                if len(parser.css(".gs_r.gs_or.gs_scl")) == 0:
-                    if "not a robot" in driver.page_source:
-                        logger.error(
-                            f"============== DETECTED AS A ROBOT {query} ============="
+                    try:
+                        WebDriverWait(driver, 10).until(
+                            self.finish_load_condition()
                         )
-                        # with open("fail_log.log", "a") as f:
-                        #     f.write(query + "\n")
-                        #     f.write("no label\n")
-                        #     f.write(driver.page_source)
+                    except TimeoutException as e:
+                        logger.error(f"Cannot Get Cited by Timeout Error: {e}")
+                    except Exception as e:
+                        logger.error(f"Cannot Get Cited by Error: {e}")
+
+                    parser = LexborHTMLParser(driver.page_source)
+
+                    if len(parser.css(".gs_r.gs_or.gs_scl")) == 0:
+                        if (
+                            "not a robot" in driver.page_source
+                            or "may be sending automated queries"
+                            in driver.page_source
+                            or "您的计算机网络中存在异常流量" in driver.page_source
+                        ):
+                            logger.error(
+                                f"============== DETECTED AS A ROBOT {query} ============="
+                            )
+                            logger.error(
+                                f"https://scholar.google.com/scholar?q={pruned_query}&hl=zh-cn&gl=us&start={page_num}"
+                            )
+                            logger.error(
+                                f"===== TO RETRY {retry_times}/{max_retry_times}"
+                            )
+                            time.sleep(random.uniform(8, 15))
+                            # with open("fail_log.log", "a") as f:
+                            #     f.write(query + "\n")
+                            #     f.write("no label\n")
+                            #     f.write(driver.page_source)
+                    else:
+                        break

                 succ_list = self.parse(
                     driver, query, parser, mode, "download", download_path

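The largest change in download_by_name replaces the single-page fetch with a bounded retry loop: up to max_retry_times = 3 retries, each sleeping a random 8-15 seconds whenever a robot check is detected, and breaking out as soon as results parse. The retry skeleton in isolation; fetch_page and is_blocked are hypothetical callables standing in for the Selenium round-trip and the block-page check:

import random
import time

MAX_RETRY_TIMES = 3

def fetch_with_retry(fetch_page, is_blocked):
    # retry_times <= max_retry_times in the diff allows up to 4 attempts total.
    page = None
    for _ in range(MAX_RETRY_TIMES + 1):
        page = fetch_page()
        if not is_blocked(page):
            return page
        time.sleep(random.uniform(8, 15))  # jittered pause before retrying
    return page  # give up; the caller parses whatever came back last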