|
| 1 | +package scrape |
| 2 | + |
| 3 | +import ( |
| 4 | + "regexp" |
| 5 | + "strings" |
| 6 | + |
| 7 | + "github.com/PuerkitoBio/goquery" |
| 8 | + "github.com/gocolly/colly" |
| 9 | + "github.com/nleeper/goment" |
| 10 | + "github.com/xbapps/xbvr/pkg/models" |
| 11 | +) |
| 12 | + |
| 13 | +func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString string) { |
| 14 | + sceneCollector := createCollector("www.javdatabase.com") |
| 15 | + |
| 16 | + sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) { |
| 17 | + sc := models.ScrapedScene{} |
| 18 | + sc.SceneType = "VR" |
| 19 | + contentId := "" |
| 20 | + |
| 21 | + // Always add 'javr' as a tag |
| 22 | + sc.Tags = append(sc.Tags, `javr`) |
| 23 | + |
| 24 | + // Skipping some very generic and useless tags |
| 25 | + skiptags := map[string]bool{ |
| 26 | + "featured actress": true, |
| 27 | + "vr exclusive": true, |
| 28 | + "high-quality vr": true, |
| 29 | + "hi-def": true, |
| 30 | + "exclusive distribution": true, |
| 31 | + } |
| 32 | + |
| 33 | + // Cast |
| 34 | + html.ForEach("h2.subhead", func(id int, h2 *colly.HTMLElement) { |
| 35 | + if h2.Text == "Featured Idols" { |
| 36 | + dom := h2.DOM |
| 37 | + parent := dom.Parent() |
| 38 | + if parent != nil { |
| 39 | + parent.Find("a").Each(func(i int, anchor *goquery.Selection) { |
| 40 | + if anchor.Text() != "" { |
| 41 | + sc.Cast = append(sc.Cast, anchor.Text()) |
| 42 | + } |
| 43 | + }) |
| 44 | + } |
| 45 | + } |
| 46 | + }) |
| 47 | + |
| 48 | + html.ForEach(`div.movietable tr`, func(id int, tr *colly.HTMLElement) { |
| 49 | + label := tr.ChildText(`td.tablelabel`) |
| 50 | + |
| 51 | + if label == `Studio:` { |
| 52 | + // Studio |
| 53 | + sc.Studio = tr.ChildText(`td.tablevalue > span`) |
| 54 | + |
| 55 | + } else if label == `DVD ID:` { |
| 56 | + // Title, SceneID and SiteID all like 'VRKM-821' format |
| 57 | + dvdId := strings.ToUpper(tr.ChildText(`td.tablevalue`)) |
| 58 | + sc.Title = dvdId |
| 59 | + sc.SceneID = dvdId |
| 60 | + sc.SiteID = dvdId |
| 61 | + |
| 62 | + // Set 'Site' to first part of the ID (e.g. `VRKM for `vrkm-821`) |
| 63 | + siteParts := strings.Split(dvdId, `-`) |
| 64 | + if len(siteParts) > 0 { |
| 65 | + sc.Site = siteParts[0] |
| 66 | + } |
| 67 | + |
| 68 | + } else if label == `Release Date:` { |
| 69 | + // Release date |
| 70 | + dateStr := tr.ChildText(`td.tablevalue`) |
| 71 | + tmpDate, _ := goment.New(strings.TrimSpace(dateStr), "YYYY-MM-DD") |
| 72 | + sc.Released = tmpDate.Format("YYYY-MM-DD") |
| 73 | + |
| 74 | + } else if label == `Genre(s):` { |
| 75 | + // Tags |
| 76 | + /* NOTE: |
| 77 | + "Tags are technically incomplete vs. what you'd get translating dmm.co.jp |
| 78 | + tags/correlating them back to their old equivalents on r18 using something |
| 79 | + like Javinizer's tag CSV" |
| 80 | + */ |
| 81 | + tr.ForEach("a", func(id int, anchor *colly.HTMLElement) { |
| 82 | + href := anchor.Attr("href") |
| 83 | + if strings.Contains(href, "javdatabase.com/genres/") { |
| 84 | + // Tags |
| 85 | + tag := strings.ToLower(anchor.Text) |
| 86 | + |
| 87 | + if !skiptags[tag] { |
| 88 | + sc.Tags = append(sc.Tags, tag) |
| 89 | + } |
| 90 | + } |
| 91 | + }) |
| 92 | + |
| 93 | + } else if label == `Translated Title:` { |
| 94 | + // Synopsis / description |
| 95 | + sc.Synopsis = tr.ChildText(`td.tablevalue`) |
| 96 | + |
| 97 | + } else if label == `Content ID:` { |
| 98 | + contentId = tr.ChildText(`td.tablevalue`) |
| 99 | + sc.HomepageURL = `https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=` + contentId + `/` |
| 100 | + sc.Covers = append(sc.Covers, `https://pics.dmm.co.jp/digital/video/`+contentId+`/`+contentId+`pl.jpg`) |
| 101 | + } |
| 102 | + }) |
| 103 | + |
| 104 | + // Screenshots |
| 105 | + html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) { |
| 106 | + linkHref := anchor.Attr(`href`) |
| 107 | + /* NOTE: |
| 108 | + it only pulls 6 gallery images, but that appears to be a limitation |
| 109 | + of how javdatabase.com is set up, they only pull 6 gallery images. |
| 110 | + */ |
| 111 | + if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) { |
| 112 | + sc.Gallery = append(sc.Gallery, linkHref) |
| 113 | + } |
| 114 | + }) |
| 115 | + |
| 116 | + // Some specific postprocessing for error-correcting 3DSVR scenes |
| 117 | + if len(contentId) > 0 && sc.Site == "DSVR" { |
| 118 | + r := regexp.MustCompile("13dsvr0(\\d{4})") |
| 119 | + match := r.FindStringSubmatch(contentId) |
| 120 | + if match != nil && len(match) > 1 { |
| 121 | + // Found a 3DSVR scene that is being wrongly categorized as DSVR |
| 122 | + log.Println("Applying DSVR->3DSVR workaround") |
| 123 | + sid := match[1] |
| 124 | + sc.Site = "3DSVR" |
| 125 | + sc.SceneID = "3DSVR-" + sid |
| 126 | + sc.Title = sc.SceneID |
| 127 | + sc.SiteID = sc.SceneID |
| 128 | + } |
| 129 | + } |
| 130 | + |
| 131 | + *out = append(*out, sc) |
| 132 | + }) |
| 133 | + |
| 134 | + // Allow comma-separated scene id's |
| 135 | + scenes := strings.Split(queryString, ",") |
| 136 | + for _, v := range scenes { |
| 137 | + sceneCollector.Visit("https://www.javdatabase.com/movies/" + strings.ToLower(v) + "/") |
| 138 | + } |
| 139 | + |
| 140 | + sceneCollector.Wait() |
| 141 | +} |
0 commit comments