Skip to content

Commit c067a49

Browse files
authored
scraper: javr: replace r18.com scene importer with javdatabase.com (#1067)
* javr: replace r18.com scene importer with javdatabase.com
* javdatabase importer:
  - Use cover image from dmm.co.jp
  - Use high-res images for the gallery
  - Link to dmm.co.jp instead of javdatabase.com (might be geoblocked though)
  - Change "Site" name to uppercase content-id prefix (e.g. "VRKM")
  - Add 'javr' as a tag
  - Prevent some variable-shadowing (no functional change)
* javdatabase importer:
  - add one tag to the exclusion list
  - error-correcting code for 3DSVR scenes being categorized as DSVR
* javdatabase importer: fix importing after website layout changes
1 parent c08297a commit c067a49

File tree

3 files changed

+145
-4
lines changed

3 files changed

+145
-4
lines changed

pkg/scrape/javdatabase.go

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
package scrape
2+
3+
import (
4+
"regexp"
5+
"strings"
6+
7+
"github.com/PuerkitoBio/goquery"
8+
"github.com/gocolly/colly"
9+
"github.com/nleeper/goment"
10+
"github.com/xbapps/xbvr/pkg/models"
11+
)
12+
13+
func ScrapeJavDB(knownScenes []string, out *[]models.ScrapedScene, queryString string) {
14+
sceneCollector := createCollector("www.javdatabase.com")
15+
16+
sceneCollector.OnHTML(`html`, func(html *colly.HTMLElement) {
17+
sc := models.ScrapedScene{}
18+
sc.SceneType = "VR"
19+
contentId := ""
20+
21+
// Always add 'javr' as a tag
22+
sc.Tags = append(sc.Tags, `javr`)
23+
24+
// Skipping some very generic and useless tags
25+
skiptags := map[string]bool{
26+
"featured actress": true,
27+
"vr exclusive": true,
28+
"high-quality vr": true,
29+
"hi-def": true,
30+
"exclusive distribution": true,
31+
}
32+
33+
// Cast
34+
html.ForEach("h2.subhead", func(id int, h2 *colly.HTMLElement) {
35+
if h2.Text == "Featured Idols" {
36+
dom := h2.DOM
37+
parent := dom.Parent()
38+
if parent != nil {
39+
parent.Find("a").Each(func(i int, anchor *goquery.Selection) {
40+
if anchor.Text() != "" {
41+
sc.Cast = append(sc.Cast, anchor.Text())
42+
}
43+
})
44+
}
45+
}
46+
})
47+
48+
html.ForEach(`div.movietable tr`, func(id int, tr *colly.HTMLElement) {
49+
label := tr.ChildText(`td.tablelabel`)
50+
51+
if label == `Studio:` {
52+
// Studio
53+
sc.Studio = tr.ChildText(`td.tablevalue > span`)
54+
55+
} else if label == `DVD ID:` {
56+
// Title, SceneID and SiteID all like 'VRKM-821' format
57+
dvdId := strings.ToUpper(tr.ChildText(`td.tablevalue`))
58+
sc.Title = dvdId
59+
sc.SceneID = dvdId
60+
sc.SiteID = dvdId
61+
62+
// Set 'Site' to first part of the ID (e.g. `VRKM for `vrkm-821`)
63+
siteParts := strings.Split(dvdId, `-`)
64+
if len(siteParts) > 0 {
65+
sc.Site = siteParts[0]
66+
}
67+
68+
} else if label == `Release Date:` {
69+
// Release date
70+
dateStr := tr.ChildText(`td.tablevalue`)
71+
tmpDate, _ := goment.New(strings.TrimSpace(dateStr), "YYYY-MM-DD")
72+
sc.Released = tmpDate.Format("YYYY-MM-DD")
73+
74+
} else if label == `Genre(s):` {
75+
// Tags
76+
/* NOTE:
77+
"Tags are technically incomplete vs. what you'd get translating dmm.co.jp
78+
tags/correlating them back to their old equivalents on r18 using something
79+
like Javinizer's tag CSV"
80+
*/
81+
tr.ForEach("a", func(id int, anchor *colly.HTMLElement) {
82+
href := anchor.Attr("href")
83+
if strings.Contains(href, "javdatabase.com/genres/") {
84+
// Tags
85+
tag := strings.ToLower(anchor.Text)
86+
87+
if !skiptags[tag] {
88+
sc.Tags = append(sc.Tags, tag)
89+
}
90+
}
91+
})
92+
93+
} else if label == `Translated Title:` {
94+
// Synopsis / description
95+
sc.Synopsis = tr.ChildText(`td.tablevalue`)
96+
97+
} else if label == `Content ID:` {
98+
contentId = tr.ChildText(`td.tablevalue`)
99+
sc.HomepageURL = `https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=` + contentId + `/`
100+
sc.Covers = append(sc.Covers, `https://pics.dmm.co.jp/digital/video/`+contentId+`/`+contentId+`pl.jpg`)
101+
}
102+
})
103+
104+
// Screenshots
105+
html.ForEach("a[href]", func(_ int, anchor *colly.HTMLElement) {
106+
linkHref := anchor.Attr(`href`)
107+
/* NOTE:
108+
it only pulls 6 gallery images, but that appears to be a limitation
109+
of how javdatabase.com is set up, they only pull 6 gallery images.
110+
*/
111+
if strings.HasPrefix(linkHref, "https://pics.dmm.co.jp/digital/video/") && strings.HasSuffix(linkHref, `.jpg`) {
112+
sc.Gallery = append(sc.Gallery, linkHref)
113+
}
114+
})
115+
116+
// Some specific postprocessing for error-correcting 3DSVR scenes
117+
if len(contentId) > 0 && sc.Site == "DSVR" {
118+
r := regexp.MustCompile("13dsvr0(\\d{4})")
119+
match := r.FindStringSubmatch(contentId)
120+
if match != nil && len(match) > 1 {
121+
// Found a 3DSVR scene that is being wrongly categorized as DSVR
122+
log.Println("Applying DSVR->3DSVR workaround")
123+
sid := match[1]
124+
sc.Site = "3DSVR"
125+
sc.SceneID = "3DSVR-" + sid
126+
sc.Title = sc.SceneID
127+
sc.SiteID = sc.SceneID
128+
}
129+
}
130+
131+
*out = append(*out, sc)
132+
})
133+
134+
// Allow comma-separated scene id's
135+
scenes := strings.Split(queryString, ",")
136+
for _, v := range scenes {
137+
sceneCollector.Visit("https://www.javdatabase.com/movies/" + strings.ToLower(v) + "/")
138+
}
139+
140+
sceneCollector.Wait()
141+
}

pkg/tasks/content.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,8 +290,8 @@ func ScrapeJAVR(queryString string) {
290290
// Start scraping
291291
var collectedScenes []models.ScrapedScene
292292

293-
tlog.Infof("Scraping R18")
294-
scrape.ScrapeR18(knownScenes, &collectedScenes, queryString)
293+
tlog.Infof("Scraping JavDB")
294+
scrape.ScrapeJavDB(knownScenes, &collectedScenes, queryString)
295295

296296
if len(collectedScenes) > 0 {
297297
db, _ := models.GetDB()

ui/src/views/options/sections/OptionsSceneCreate.vue

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
<template>
22
<div class="content">
3-
<h3 class="title">{{$t('Import JAVR scene from R18')}}</h3>
3+
<h3 class="title">{{$t('Import JAVR scene from javdatabase.com')}}</h3>
44
<div class="card">
55
<div class="card-content content">
66
<b-field grouped>
7-
<b-input v-model="javrQuery" placeholder="URL or ID (XXXX-001)" type="search"></b-input>
7+
<b-input v-model="javrQuery" placeholder="ID (xxxx-001)" type="search"></b-input>
88
<b-button class="button is-primary" v-on:click="scrapeJAVR()">{{$t('Go')}}</b-button>
99
</b-field>
1010
</div>

0 commit comments

Comments
 (0)