 import threading
 
 from google_scholar_py import CustomGoogleScholarOrganic
+from .string_similarity import StringSimilarity
 
 logger = logging.getLogger(__name__)
 
@@ -71,7 +72,10 @@ def __init__(self, persist_store=None, web_data_folder="", meta_folder="") -> None:
 
         self.web_data_folder = web_data_folder
 
-        self.request_interval = 5
+        self.request_interval = 10
+
+    def _formulate_query(self, query):
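+        # Sanitize the raw title/query: any character outside [a-zA-Z0-9, space]
+        # becomes "_", which keeps the string safe to embed in a Scholar URL.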
+        return re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
 
     def safe_request(self, driver, link):
         with BibSearchGoogleScholar.google_scholar_request_lock:
@@ -80,13 +84,13 @@ def safe_request(self, driver, link):
             interval = time.time() - BibSearchGoogleScholar.last_request_google_scholar
             if interval < self.request_interval:
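+                # Jittered back-off: wait a random amount between request_interval
+                # and request_interval + 6 s so the request cadence is not uniform.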
                 time_to_wait = (
-                    random.uniform(self.request_interval, self.request_interval + 5)
+                    random.uniform(self.request_interval, self.request_interval + 6)
                     - interval
                 )
 
                 time.sleep(time_to_wait)
 
-            logger.info(f"Time Issues: {time.time()} - {time_to_wait} {link}")
+                logger.warning(f"Rate-limit wait: {time_to_wait:.2f}s before requesting {link}")
 
             driver.get(link)
 
@@ -429,7 +433,8 @@ def _get_cited_by_paper_names(self, driver, link, max_results=50):
             if page_num == 0:
                 refined_link = f"{link}"
             else:
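+                # Rebuild the cited-by pagination URL from the parsed query params;
+                # the new version drops the trailing "&scipsc=" of the old format.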
-                refined_link = f"{link_header}?start={str(page_num)}&{link_params['hl']}&{link_params['as_sdt']}&{link_params['sciodt']}&{link_params['cites']}&scipsc="
+                refined_link = f"{link_header}?start={str(page_num)}&{link_params['hl']}&{link_params['as_sdt']}&{link_params['sciodt']}&{link_params['cites']}"
 
             driver = self.safe_request(
                 driver=driver,
@@ -451,8 +456,9 @@ def _get_cited_by_paper_names(self, driver, link, max_results=50):
             if (
                 "not a robot" in driver.page_source
                 or "may be sending automated queries" in driver.page_source
+                or "您的计算机网络中存在异常流量" in driver.page_source  # zh-CN: "unusual traffic from your computer network"
             ):
-                logger.error("Detected as a spider")
+                logger.error("============== DETECTED AS A SPIDER ===============")
             parser = LexborHTMLParser(driver.page_source)
 
             if get_content:
@@ -496,9 +502,21 @@ def parse(
                     title = str(time.time())
 
                 if mode == "exact":
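+                    # Exact mode: a sanitized-substring hit counts as a perfect
+                    # match; anything else falls back to semantic similarity.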
-                    similarity = difflib.SequenceMatcher(
-                        None, title.lower(), query.lower()
-                    ).ratio()
+                    if self._formulate_query(title).lower() in query.lower():
+                        similarity = 1
+                    else:
+                        similarity = StringSimilarity.semantic_similarity(
+                            title.lower(), query.lower()
+                        )
                     logger.info(
                         f"Scholar compared with: {query}, Found paper: {title} with similarity {similarity}"
                     )
@@ -524,9 +542,11 @@ def parse(
                     outputs.append(this_bib)
 
                 elif action == "download":
+                    logger.info("Starting download")
                     succ, file_path, file_name, exist = self.download(
                         driver, title, result, download_path
                     )
+                    logger.info("Download finished")
 
                     if not succ and not exist:
                         logger.warning(f"Found {title}, but download failed.")
@@ -555,7 +575,6 @@ def parse(
                         driver, title, result, cite_directory
                     )
 
-                    logger.error(f"already get bib: {this_bib}")
                     if (
                         "cited_by_link" in this_bib
                         and this_bib["cited_by_link"] is not None
@@ -564,7 +583,6 @@ def parse(
                         this_bib["cited_by"] = self._get_cited_by_paper_names(
                             driver, this_bib["cited_by_link"]
                         )
-                    logger.error(f"finish use this bib")
                 except Exception as e:
                     if this_bib is None:
                         meta_file_path = None
@@ -574,10 +592,14 @@ def parse(
                     self.persist_store.save_state(
                         self.meta_folder, file_name, this_bib
                     )
-                if meta_file_path:
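+                # Each outputs entry is (success, file_path, meta_file_path, already_exists).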
+                if succ:
                     outputs.append((True, file_path, meta_file_path, exist))
+                elif meta_file_path:
+                    outputs.append((True, None, meta_file_path, exist))
+                else:
+                    outputs.append((False, None, meta_file_path, exist))
 
                 if len(outputs) >= num_per_page:
                     break
@@ -721,7 +743,7 @@ def search_by_name(
 
         page_num = 0
         organic_results_data = []
-        pruned_query = re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
+        pruned_query = self._formulate_query(query)
         logger.info(f"pruned query {pruned_query}")
 
         # parse all pages
@@ -800,7 +822,8 @@ def download(self, driver, title, result, download_path):
             pdf_link: str = result.css_first(".gs_or_ggsm a").attrs["href"]
 
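+            # Presumably some hosts refuse direct PDF fetches without a Scholar
+            # referer, so send one alongside the browser-like User-Agent.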
             headers = {
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+                "Referer": "https://scholar.google.com/",
             }
             response = requests.get(pdf_link, headers=headers)
 
@@ -811,18 +834,19 @@ def download(self, driver, title, result, download_path):
                 return True, file_path, scholar_name, False
             else:
                 logger.warning(
-                    f"Failed to download. Status code: {response.status_code}"
+                    f"Failed to download. Status code: {response.status_code}. Logging to fail_log.log."
                 )
+
                 with open("fail_log.log", "a") as f:
                     f.write(file_path + "\n")
-                    f.write(pdf_link + "\n")
+                    # f.write(pdf_link + "\n")
                     f.write("STATUS CODE: " + str(response.status_code) + "\n")
                     f.write("\n")
         except Exception as e:
             logger.error(f"Download failed: {e}")
             with open("fail_log.log", "a") as f:
                 f.write(file_path + "\n")
-                f.write(pdf_link + "\n")
+                # f.write(pdf_link + "\n")
                 f.write(str(e) + "\n")
                 f.write("\n")
 
@@ -837,9 +861,18 @@ def condition_met(driver):
             except Exception:
                 element_present = False
 
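+            # Fallback: some result layouts expose only the "gs_res_ccl_mid" container.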
+            if not element_present:
+                try:
+                    element_present = EC.presence_of_element_located(
+                        (By.ID, "gs_res_ccl_mid")
+                    )(driver)
+                except Exception:
+                    element_present = False
+
             text_present = (
                 "not a robot" in driver.page_source
                 or "may be sending automated queries" in driver.page_source
+                or "您的计算机网络中存在异常流量" in driver.page_source  # zh-CN: "unusual traffic from your computer network"
             )
 
             return element_present or text_present
@@ -876,7 +909,7 @@ def download_by_name(
         if pagination:
             while page_num <= 10:
                 # parse all pages (the first two pages)
-                pruned_query = re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
+                pruned_query = self._formulate_query(query)
                 driver = self.safe_request(
                     driver=driver,
                     link=f"https://scholar.google.com/scholar?q={pruned_query}&hl=en&gl=us&start={page_num}",
@@ -911,27 +944,53 @@ def download_by_name(
         else:
             # parse first page only
             # logger.error("### START TO DOWNLOAD #####")
-            pruned_query = re.sub(r"[^a-zA-Z0-9, ]", "_", query.strip())
+            pruned_query = self._formulate_query(query)
             # logger.error(pruned_query)
 
-            driver = self.safe_request(
-                driver=driver,
-                link=f"https://scholar.google.com/scholar?q={pruned_query}&hl=en&gl=us&start={page_num}",
-            )
-
-            WebDriverWait(driver, 10).until(self.finish_load_condition())
+            retry_times = 0
+            max_retry_times = 3
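+            # Re-request the results page up to max_retry_times when the bot
+            # check trips, sleeping 8-15 s between attempts before giving up.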
 
-            parser = LexborHTMLParser(driver.page_source)
+            while retry_times <= max_retry_times:
+                retry_times += 1
+                driver = self.safe_request(
+                    driver=driver,
+                    link=f"https://scholar.google.com/scholar?q={pruned_query}&hl=zh-cn&gl=us&as_sdt=0,5&start={page_num}",
+                )
 
-            if len(parser.css(".gs_r.gs_or.gs_scl")) == 0:
-                if "not a robot" in driver.page_source:
-                    logger.error(
-                        f"============== DETECTED AS A ROBOT {query} ============="
+                try:
+                    WebDriverWait(driver, 10).until(
+                        self.finish_load_condition()
                     )
-                    # with open("fail_log.log", "a") as f:
-                    # f.write(query + "\n")
-                    # f.write("no label\n")
-                    # f.write(driver.page_source)
+                except TimeoutException as e:
+                    logger.error(f"Timed out loading results page: {e}")
+                except Exception as e:
+                    logger.error(f"Failed to load results page: {e}")
+
+                parser = LexborHTMLParser(driver.page_source)
+
+                if len(parser.css(".gs_r.gs_or.gs_scl")) == 0:
+                    if (
+                        "not a robot" in driver.page_source
+                        or "may be sending automated queries"
+                        in driver.page_source
+                        or "您的计算机网络中存在异常流量" in driver.page_source  # zh-CN: "unusual traffic from your computer network"
+                    ):
+                        logger.error(
+                            f"============== DETECTED AS A ROBOT {query} ============="
+                        )
+                        logger.error(
+                            f"https://scholar.google.com/scholar?q={pruned_query}&hl=zh-cn&gl=us&start={page_num}"
+                        )
+                        logger.error(
+                            f"===== TO RETRY {retry_times}/{max_retry_times}"
+                        )
+                        time.sleep(random.uniform(8, 15))
+                        # with open("fail_log.log", "a") as f:
+                        # f.write(query + "\n")
+                        # f.write("no label\n")
+                        # f.write(driver.page_source)
+                else:
+                    break
 
             succ_list = self.parse(
                 driver, query, parser, mode, "download", download_path