Skip to content

Commit bf90130

Browse files
committed
Support schemeless matching
1 parent da7fc94 commit bf90130

File tree

2 files changed

+75
-15
lines changed

2 files changed

+75
-15
lines changed

src/__tests__/unit/checks/keywords-urls.test.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,50 @@ describe('urls guardrail', () => {
340340
);
341341
expect(result.info?.blocked).toContain('https://example.com/other');
342342
});
343+
344+
it('matches scheme-less URLs against scheme-qualified allow list entries', async () => {
345+
// Test exact behavior: scheme-qualified allow list vs scheme-less/explicit URLs
346+
const config = {
347+
url_allow_list: ['https://suntropy.es'],
348+
allowed_schemes: new Set(['https']),
349+
allow_subdomains: false,
350+
block_userinfo: true,
351+
};
352+
353+
// Test scheme-less URL (should be allowed)
354+
const result1 = await urls({}, 'Visit suntropy.es', config);
355+
expect(result1.info?.allowed).toContain('suntropy.es');
356+
expect(result1.tripwireTriggered).toBe(false);
357+
358+
// Test HTTPS URL (should match allow list scheme)
359+
const result2 = await urls({}, 'Visit https://suntropy.es', config);
360+
expect(result2.info?.allowed).toContain('https://suntropy.es');
361+
expect(result2.tripwireTriggered).toBe(false);
362+
363+
// Test HTTP URL (wrong explicit scheme should be blocked)
364+
const result3 = await urls({}, 'Visit http://suntropy.es', config);
365+
expect(result3.info?.blocked).toContain('http://suntropy.es');
366+
expect(result3.tripwireTriggered).toBe(true);
367+
});
368+
369+
it('blocks subdomains and paths correctly with scheme-qualified allow list', async () => {
370+
// Verify subdomains and paths are still blocked according to allow list rules
371+
const config = {
372+
url_allow_list: ['https://suntropy.es'],
373+
allowed_schemes: new Set(['https']),
374+
allow_subdomains: false,
375+
block_userinfo: true,
376+
};
377+
378+
const text = 'Visit help-suntropy.es and help.suntropy.es';
379+
const result = await urls({}, text, config);
380+
381+
// Both should be blocked - not in allow list
382+
expect(result.tripwireTriggered).toBe(true);
383+
expect(result.info?.blocked).toHaveLength(2);
384+
expect(result.info?.blocked).toContain('help-suntropy.es');
385+
expect(result.info?.blocked).toContain('help.suntropy.es');
386+
});
343387
});
344388

345389
describe('competitors guardrail', () => {

src/checks/urls.ts

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -243,58 +243,63 @@ function detectUrls(text: string): string[] {
243243
function validateUrlSecurity(
244244
urlString: string,
245245
config: UrlsConfig
246-
): { parsedUrl: URL | null; reason: string } {
246+
): { parsedUrl: URL | null; reason: string; hadScheme: boolean } {
247247
try {
248248
let parsedUrl: URL;
249249
let originalScheme: string;
250+
let hadScheme: boolean;
250251

251252
// Parse URL - preserve original scheme for validation
252253
if (urlString.includes('://')) {
253254
// Standard URL with double-slash scheme (http://, https://, ftp://, etc.)
254255
parsedUrl = new URL(urlString);
255256
originalScheme = parsedUrl.protocol.replace(/:$/, '');
257+
hadScheme = true;
256258
} else if (
257259
urlString.includes(':') &&
258260
urlString.split(':', 1)[0].match(/^(data|javascript|vbscript|mailto)$/)
259261
) {
260262
// Special single-colon schemes
261263
parsedUrl = new URL(urlString);
262264
originalScheme = parsedUrl.protocol.replace(/:$/, '');
265+
hadScheme = true;
263266
} else {
264267
// Add http scheme for parsing, but remember this is a default
265268
parsedUrl = new URL(`http://${urlString}`);
266269
originalScheme = 'http'; // Default scheme for scheme-less URLs
270+
hadScheme = false;
267271
}
268272

269273
// Basic validation: must have scheme and hostname (except for special schemes)
270274
if (!parsedUrl.protocol) {
271-
return { parsedUrl: null, reason: 'Invalid URL format' };
275+
return { parsedUrl: null, reason: 'Invalid URL format', hadScheme: false };
272276
}
273277

274278
// Special schemes like data: and javascript: don't need hostname
275279
const parsedScheme = parsedUrl.protocol.replace(/:$/, '').toLowerCase();
276280
if (!HOSTLESS_SCHEMES.has(parsedScheme) && !parsedUrl.hostname) {
277-
return { parsedUrl: null, reason: 'Invalid URL format' };
281+
return { parsedUrl: null, reason: 'Invalid URL format', hadScheme };
278282
}
279283

280284
// Security validations - use original scheme
285+
// Only check allowed_schemes if the URL explicitly had a scheme
281286
const normalizedScheme = originalScheme.toLowerCase();
282287

283-
if (!config.allowed_schemes.has(normalizedScheme)) {
284-
return { parsedUrl: null, reason: `Blocked scheme: ${normalizedScheme}` };
288+
if (hadScheme && !config.allowed_schemes.has(normalizedScheme)) {
289+
return { parsedUrl: null, reason: `Blocked scheme: ${normalizedScheme}`, hadScheme };
285290
}
286291

287292
if (config.block_userinfo && (parsedUrl.username || parsedUrl.password)) {
288-
return { parsedUrl: null, reason: 'Contains userinfo (potential credential injection)' };
293+
return { parsedUrl: null, reason: 'Contains userinfo (potential credential injection)', hadScheme };
289294
}
290295

291296
// Everything else (IPs, localhost, private IPs) goes through allow list logic
292-
return { parsedUrl, reason: '' };
297+
return { parsedUrl, reason: '', hadScheme };
293298
} catch (error) {
294299
// Provide specific error information for debugging
295300
const errorName = error instanceof Error ? error.name : 'Error';
296301
const errorMessage = error instanceof Error ? error.message : String(error);
297-
return { parsedUrl: null, reason: `URL parsing error: ${errorName}: ${errorMessage}` };
302+
return { parsedUrl: null, reason: `URL parsing error: ${errorName}: ${errorMessage}`, hadScheme: false };
298303
}
299304
}
300305

@@ -328,8 +333,13 @@ function isIpv4Address(value: string): boolean {
328333

329334
/**
330335
* Check if URL is allowed based on the allow list configuration.
336+
*
337+
* @param parsedUrl - The parsed URL to check
338+
* @param allowList - List of allowed URL patterns
339+
* @param allowSubdomains - Whether to allow subdomains
340+
* @param hadScheme - Whether the original URL had an explicit scheme
331341
*/
332-
function isUrlAllowed(parsedUrl: URL, allowList: string[], allowSubdomains: boolean): boolean {
342+
function isUrlAllowed(parsedUrl: URL, allowList: string[], allowSubdomains: boolean, hadScheme: boolean): boolean {
333343
if (allowList.length === 0) {
334344
return false;
335345
}
@@ -395,11 +405,14 @@ function isUrlAllowed(parsedUrl: URL, allowList: string[], allowSubdomains: bool
395405
continue;
396406
}
397407

398-
if (hasExplicitScheme && allowedScheme && allowedScheme !== schemeLower) {
408+
// Scheme matching for IPs: only enforce when BOTH allow list entry AND URL have explicit schemes
409+
if (hasExplicitScheme && hadScheme && allowedScheme && allowedScheme !== schemeLower) {
399410
continue;
400411
}
401412

402-
if (allowedPort !== null) {
413+
// Port matching: only enforce when allow list entry explicitly specifies a port
414+
// Check parsedAllowed.port (empty string when no port specified) not allowedPort (always has default)
415+
if (parsedAllowed.port) {
403416
if (urlPort === null || allowedPort !== urlPort) {
404417
continue;
405418
}
@@ -435,7 +448,9 @@ function isUrlAllowed(parsedUrl: URL, allowList: string[], allowSubdomains: bool
435448

436449
const allowedDomain = allowedHost.replace(/^www\./, '');
437450

438-
if (allowedPort !== null) {
451+
// Port matching: only enforce when allow list entry explicitly specifies a port
452+
// Check parsedAllowed.port (empty string when no port specified) not allowedPort (always has default)
453+
if (parsedAllowed.port) {
439454
if (urlPort === null || allowedPort !== urlPort) {
440455
continue;
441456
}
@@ -447,7 +462,8 @@ function isUrlAllowed(parsedUrl: URL, allowList: string[], allowSubdomains: bool
447462
continue;
448463
}
449464

450-
if (hasExplicitScheme && allowedScheme && allowedScheme !== schemeLower) {
465+
// Scheme matching for domains: only enforce when BOTH allow list entry AND URL have explicit schemes
466+
if (hasExplicitScheme && hadScheme && allowedScheme && allowedScheme !== schemeLower) {
451467
continue;
452468
}
453469

@@ -492,7 +508,7 @@ export const urls: CheckFn<UrlsContext, string, UrlsConfig> = async (ctx, data,
492508

493509
for (const urlString of detectedUrls) {
494510
// Validate URL with security checks
495-
const { parsedUrl, reason } = validateUrlSecurity(urlString, actualConfig);
511+
const { parsedUrl, reason, hadScheme } = validateUrlSecurity(urlString, actualConfig);
496512

497513
if (parsedUrl === null) {
498514
blocked.push(urlString);
@@ -509,7 +525,7 @@ export const urls: CheckFn<UrlsContext, string, UrlsConfig> = async (ctx, data,
509525
// They were already validated for scheme permission in validateUrlSecurity
510526
allowed.push(urlString);
511527
} else if (
512-
isUrlAllowed(parsedUrl, actualConfig.url_allow_list, actualConfig.allow_subdomains)
528+
isUrlAllowed(parsedUrl, actualConfig.url_allow_list, actualConfig.allow_subdomains, hadScheme)
513529
) {
514530
allowed.push(urlString);
515531
} else {

0 commit comments

Comments
 (0)