Skip to content

Commit f235274

Browse files
committed

File tree

1 file changed

+32
-47
lines changed

1 file changed

+32
-47
lines changed

app/src/Commands/FeedProcessor.php

Lines changed: 32 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -1180,6 +1180,7 @@ private function processPaginatedXmlFeed(array $feed, \SimpleXMLElement $xml, &$
11801180
}
11811181
/**
11821182
* Check if content is Substack subscriber-only
1183+
* Detects the "Read more" link pattern at the end of truncated content
11831184
*/
11841185
private function isSubstackSubscriberOnly(string $url, ?string $content): bool
11851186
{
@@ -1191,56 +1192,40 @@ private function isSubstackSubscriberOnly(string $url, ?string $content): bool
11911192
return false;
11921193
}
11931194

1194-
$subscriberPatterns = [
1195-
// English
1196-
'/This is exclusive content for subscribers/i',
1197-
'/This post is for paid subscribers/i',
1198-
'/This post is for paying subscribers/i',
1199-
'/Subscribe to keep reading/i',
1200-
'/Subscribe now to continue reading/i',
1201-
'/Upgrade to paid/i',
1202-
'/Subscribe to read the full story/i',
1203-
'/This is a preview/i.*subscribe/i',
1204-
'/Get \d+% off for \d+ year/i',
1205-
'/Upgrade your subscription/i',
1206-
'/subscribers only/i',
1207-
'/Subscribe to unlock/i',
1208-
'/Already a paying subscriber/i',
1209-
'/Become a paid subscriber/i',
1210-
1211-
// Spanish
1212-
'/Este es contenido exclusivo para suscriptores/i',
1213-
'/Este contenido es para suscriptores/i',
1214-
'/Esta publicación es para suscriptores/i',
1215-
'/Suscríbete para seguir leyendo/i',
1216-
'/Suscríbete para continuar leyendo/i',
1217-
'/Actualiza a suscripción de pago/i',
1218-
'/Suscríbete para leer la historia completa/i',
1219-
'/solo para suscriptores/i',
1220-
'/Suscríbete para desbloquear/i',
1221-
'/Conviértete en suscriptor de pago/i',
1222-
'/Actualiza tu suscripción/i',
1223-
'/contenido exclusivo para suscriptores/i',
1224-
1225-
// Portuguese
1226-
'/Este é um conteúdo exclusivo para os assinantes/i',
1227-
'/Este conteúdo é para assinantes/i',
1228-
'/Esta publicação é para assinantes/i',
1229-
'/Assine para continuar lendo/i',
1230-
'/Assine agora para continuar lendo/i',
1231-
'/Atualize para assinatura paga/i',
1232-
'/Assine para ler a história completa/i',
1233-
'/apenas para assinantes/i',
1234-
'/Assine para desbloquear/i',
1235-
'/Torne-se um assinante pago/i',
1236-
'/Atualize sua assinatura/i',
1237-
'/conteúdo exclusivo para assinantes/i',
1238-
'/somente assinantes/i'
1195+
// Remove whitespace and get the last 500 characters for checking
1196+
$trimmedContent = trim($content);
1197+
$endContent = substr($trimmedContent, -500);
1198+
1199+
// Pattern 1: Check for "Read more" link pointing to substack.com at the end
1200+
// Example: <p> <a href="https://example.substack.com/p/post-name"> Read more </a> </p>
1201+
$readMorePattern = '/<p>\s*<a\s+href=["\']https?:\/\/[^"\']*\.?substack\.com[^"\']*["\']>\s*Read more\s*<\/a>\s*<\/p>\s*$/i';
1202+
1203+
if (preg_match($readMorePattern, $endContent)) {
1204+
$this->climate->whisper("Detected Substack subscriber-only content (Read more link) in: {$url}");
1205+
return true;
1206+
}
1207+
1208+
// Pattern 2: Alternative "Read more" patterns without strict spacing
1209+
$readMorePattern2 = '/<p[^>]*>\s*<a[^>]+href=["\']https?:\/\/[^"\']*\.?substack\.com[^"\']*["\'][^>]*>\s*Read\s+more\s*<\/a>\s*<\/p>\s*$/i';
1210+
1211+
if (preg_match($readMorePattern2, $endContent)) {
1212+
$this->climate->whisper("Detected Substack subscriber-only content (Read more link alt) in: {$url}");
1213+
return true;
1214+
}
1215+
1216+
// Pattern 3: Check for common subscriber-only content indicators as fallback
1217+
$subscriberIndicators = [
1218+
'/Este (?:é um )?conteúdo exclusivo para (?:os )?assinantes/i',
1219+
'/This is (?:a |an )?(?:exclusive )?content for (?:paid )?subscribers/i',
1220+
'/Este(?:s)? (?:es|é) contenido exclusivo para suscriptores/i',
1221+
'/Subscribe (?:now )?to (?:keep |continue )?reading/i',
1222+
'/Assine (?:agora )?para (?:continuar |seguir )?lendo/i',
1223+
'/Suscr[ií]bete (?:ahora )?para (?:seguir |continuar )?leyendo/i'
12391224
];
12401225

1241-
foreach ($subscriberPatterns as $pattern) {
1226+
foreach ($subscriberIndicators as $pattern) {
12421227
if (preg_match($pattern, $content)) {
1243-
$this->climate->whisper("Detected Substack subscriber-only content in: {$url}");
1228+
$this->climate->whisper("Detected Substack subscriber-only content (text indicator) in: {$url}");
12441229
return true;
12451230
}
12461231
}

0 commit comments

Comments
 (0)