Skip to content

Commit 12f6ea8

Browse files
committed
Add safety checks when walking stack
1 parent 0049066 commit 12f6ea8

File tree

3 files changed

+246
-44
lines changed

3 files changed

+246
-44
lines changed

ext/libdatadog_api/crashtracker.c

Lines changed: 106 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@
3434

3535
#include <datadog/crashtracker.h>
3636
#include "datadog_ruby_common.h"
37+
#include <sys/mman.h>
38+
#include <unistd.h>
39+
#include <errno.h>
3740

3841
// Include profiling stack walking functionality
3942
// Note: rb_iseq_path and rb_iseq_base_label are already declared in MJIT header
@@ -55,7 +58,78 @@ static void ruby_runtime_stack_callback(
5558

5659
static bool first_init = true;
5760

58-
// Helper functions for stack walking will be added here when we implement full stack walking
61+
// Safety checks for signal-safe stack walking
62+
static bool is_pointer_readable(const void *ptr, size_t size) {
63+
if (!ptr) return false;
64+
65+
// This is signal-safe and doesn't allocate memory
66+
size_t page_size = getpagesize();
67+
void *aligned_ptr = (void*)((uintptr_t)ptr & ~(page_size - 1));
68+
size_t pages = ((char*)ptr + size - (char*)aligned_ptr + page_size - 1) / page_size;
69+
70+
// Stack-allocate a small buffer for mincore results.. should be safe?
71+
char vec[16]; // Support up to 16 pages (64KB on 4K page systems)
72+
if (pages > 16) return false; // Too big to check safely
73+
74+
return mincore(aligned_ptr, pages * page_size, vec) == 0;
75+
}
76+
77+
static bool is_reasonable_string_size(VALUE str) {
78+
if (str == Qnil) return false;
79+
80+
long len = RSTRING_LEN(str);
81+
82+
// Sanity checks for corrupted string lengths
83+
if (len < 0) return false; // Negative length, probably corrupted
84+
if (len > 1024) return false; // > 1KB path/function name, suspicious for crash context
85+
86+
return true;
87+
}
88+
89+
static const char* safe_string_ptr(VALUE str) {
90+
if (str == Qnil) return "<nil>";
91+
92+
long len = RSTRING_LEN(str);
93+
if (len < 0 || len > 2048) return "<corrupted>";
94+
95+
const char *ptr = RSTRING_PTR(str);
96+
if (!ptr) return "<null>";
97+
98+
if (!is_pointer_readable(ptr, len)) return "<unreadable>";
99+
100+
return ptr;
101+
}
102+
103+
static bool is_valid_control_frame(const rb_control_frame_t *cfp,
104+
const rb_execution_context_t *ec) {
105+
if (!cfp) return false;
106+
107+
void *stack_start = ec->vm_stack;
108+
void *stack_end = (char*)stack_start + ec->vm_stack_size * sizeof(VALUE);
109+
if ((void*)cfp < stack_start || (void*)cfp >= stack_end) {
110+
return false;
111+
}
112+
113+
if (!is_pointer_readable(cfp, sizeof(rb_control_frame_t))) {
114+
return false;
115+
}
116+
117+
return true;
118+
}
119+
120+
static bool is_valid_iseq(const rb_iseq_t *iseq) {
121+
if (!iseq) return false;
122+
if (!is_pointer_readable(iseq, sizeof(rb_iseq_t))) return false;
123+
124+
// Check iseq body
125+
if (!iseq->body) return false;
126+
if (!is_pointer_readable(iseq->body, sizeof(*iseq->body))) return false;
127+
128+
// Validate iseq size
129+
if (iseq->body->iseq_size > 100000) return false; // > 100K instructions, suspicious
130+
131+
return true;
132+
}
59133

60134
// Used to report Ruby VM crashes.
61135
// Once initialized, segfaults will be reported automatically using libdatadog.
@@ -175,28 +249,13 @@ static VALUE _native_stop(DDTRACE_UNUSED VALUE _self) {
175249
return Qtrue;
176250
}
177251

178-
// Ruby runtime stack callback implementation
179-
// This function will be called by libdatadog during crash handling
252+
180253
static void ruby_runtime_stack_callback(
181254
void (*emit_frame)(const ddog_crasht_RuntimeStackFrame*),
182255
void (*emit_stacktrace_string)(const char*)
183256
) {
184-
// Only using emit_frame - ignore emit_stacktrace_string parameter
185257
(void)emit_stacktrace_string;
186258

187-
// Try to safely walk Ruby stack using direct VM structure access
188-
// Avoid Ruby API calls that might hang in crash context
189-
190-
// First emit a marker frame to show callback is working
191-
ddog_crasht_RuntimeStackFrame marker_frame = {
192-
.function_name = "ruby_stack_walker_start",
193-
.file_name = "crashtracker.c",
194-
.line_number = 1,
195-
.column_number = 0
196-
};
197-
emit_frame(&marker_frame);
198-
199-
// Try to get current thread and execution context safely
200259
VALUE current_thread = rb_thread_current();
201260
if (current_thread == Qnil) return;
202261

@@ -213,7 +272,6 @@ static void ruby_runtime_stack_callback(
213272
const rb_execution_context_t *ec = th->ec;
214273
if (!ec) return;
215274

216-
// Safety checks
217275
if (th->status == THREAD_KILLED) return;
218276
if (!ec->vm_stack || ec->vm_stack_size == 0) return;
219277

@@ -222,40 +280,56 @@ static void ruby_runtime_stack_callback(
222280

223281
if (!cfp || !end_cfp) return;
224282

225-
// Skip dummy frames
226283
end_cfp = RUBY_VM_NEXT_CONTROL_FRAME(end_cfp);
227284
if (end_cfp <= cfp) return;
228285

229-
// Walk stack frames with extreme caution
286+
const rb_control_frame_t *top_sentinel = RUBY_VM_NEXT_CONTROL_FRAME(cfp);
287+
288+
cfp = RUBY_VM_NEXT_CONTROL_FRAME(end_cfp);
289+
230290
int frame_count = 0;
231-
const int MAX_FRAMES = 10; // Keep very low to minimize crash risk
291+
const int MAX_FRAMES = 20;
232292

233-
for (; cfp != RUBY_VM_NEXT_CONTROL_FRAME(end_cfp) && frame_count < MAX_FRAMES;
234-
cfp = RUBY_VM_NEXT_CONTROL_FRAME(cfp)) {
293+
for (; cfp != top_sentinel && frame_count < MAX_FRAMES; cfp = RUBY_VM_NEXT_CONTROL_FRAME(cfp)) {
294+
if (!is_valid_control_frame(cfp, ec)) {
295+
continue; // Skip invalid frames
296+
}
297+
298+
if (cfp->iseq && !cfp->pc) {
299+
continue;
300+
}
235301

236302
if (VM_FRAME_RUBYFRAME_P(cfp) && cfp->iseq) {
237-
// Instead of calling rb_iseq_* functions, work directly with iseq struct
238303
const rb_iseq_t *iseq = cfp->iseq;
239304

240-
// Basic frame info without risky API calls
241-
char frame_desc[64];
242-
snprintf(frame_desc, sizeof(frame_desc), "ruby_frame_%d", frame_count);
305+
if (!iseq || !iseq->body) {
306+
continue;
307+
}
308+
309+
VALUE name = rb_iseq_base_label(iseq);
310+
const char *function_name = (name != Qnil) ? safe_string_ptr(name) : "<unknown>";
311+
312+
VALUE filename = rb_iseq_path(iseq);
313+
const char *file_name = (filename != Qnil) ? safe_string_ptr(filename) : "<unknown>";
243314

244-
// Try to get basic line info safely using direct struct access
245315
int line_no = 0;
246316
if (iseq && cfp->pc) {
247-
// Use direct access to iseq body instead of helper functions
248317
if (iseq->body && iseq->body->iseq_encoded && iseq->body->iseq_size > 0) {
249318
ptrdiff_t pc_offset = cfp->pc - iseq->body->iseq_encoded;
250319
if (pc_offset >= 0 && pc_offset < iseq->body->iseq_size) {
251-
line_no = frame_count + 1; // Use frame position as approximation
320+
// Use the Ruby VM line calculation like ddtrace_rb_profile_frames
321+
size_t pos = pc_offset;
322+
if (pos > 0) {
323+
pos--; // Use pos-1 because PC points next instruction
324+
}
325+
line_no = rb_iseq_line_no(iseq, pos);
252326
}
253327
}
254328
}
255329

256330
ddog_crasht_RuntimeStackFrame frame = {
257-
.function_name = frame_desc,
258-
.file_name = "ruby_source.rb",
331+
.function_name = function_name,
332+
.file_name = file_name,
259333
.line_number = line_no,
260334
.column_number = 0
261335
};
@@ -264,21 +338,11 @@ static void ruby_runtime_stack_callback(
264338
frame_count++;
265339
}
266340
}
267-
268-
// Emit end marker
269-
ddog_crasht_RuntimeStackFrame end_frame = {
270-
.function_name = "ruby_stack_walker_end",
271-
.file_name = "crashtracker.c",
272-
.line_number = frame_count,
273-
.column_number = 0
274-
};
275-
emit_frame(&end_frame);
276341
}
277342

278343
static VALUE _native_register_runtime_stack_callback(DDTRACE_UNUSED VALUE _self, VALUE callback_type) {
279344
ENFORCE_TYPE(callback_type, T_SYMBOL);
280345

281-
// Verify we're using the frame type (should always be :frame)
282346
VALUE frame_symbol = ID2SYM(rb_intern("frame"));
283347
if (callback_type != frame_symbol) {
284348
rb_raise(rb_eArgError, "Invalid callback_type. Only :frame is supported");

spec/datadog/core/crashtracking/component_spec.rb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@
240240
crash_tracker.start
241241
trigger.call
242242
end
243-
puts(crash_report_experimental)
243+
puts(crash_report_experimental.to_json)
244244
puts(log_messages)
245245
expect(stack_trace).to match(array_including(hash_including(function: function)))
246246
expect(stack_trace.size).to be > 10
@@ -348,7 +348,6 @@
348348
end
349349
end
350350

351-
352351
describe '#runtime_callback_registered?' do
353352
it 'returns true when callback is registered' do
354353
crashtracker = build_crashtracker(logger: logger)

test_crashtracker_standalone.rb

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#!/usr/bin/env ruby
2+
3+
require 'json'
4+
require 'webrick'
5+
require 'fiddle'
6+
require 'datadog'
7+
8+
# Simple HTTP server to capture crash reports
9+
def start_crash_server(port)
10+
server = WEBrick::HTTPServer.new(
11+
Port: port,
12+
Logger: WEBrick::Log.new(File.open(File::NULL, "w")),
13+
AccessLog: []
14+
)
15+
16+
crash_report = nil
17+
18+
server.mount_proc '/' do |req, res|
19+
if req.request_method == 'POST'
20+
body = req.body
21+
crash_report = JSON.parse(body) rescue body
22+
puts "=== CRASH REPORT RECEIVED ==="
23+
puts JSON.pretty_generate(crash_report) if crash_report.is_a?(Hash)
24+
puts "=== END CRASH REPORT ==="
25+
end
26+
res.body = '{}'
27+
end
28+
29+
Thread.new { server.start }
30+
[server, proc { crash_report }]
31+
end
32+
33+
# Nested Ruby functions to create a meaningful stack
34+
def level_5
35+
puts "In level_5 - about to crash"
36+
# Trigger segfault
37+
Fiddle.free(42)
38+
end
39+
40+
def level_4
41+
puts "In level_4"
42+
level_5
43+
end
44+
45+
def level_3
46+
puts "In level_3"
47+
level_4
48+
end
49+
50+
def level_2
51+
puts "In level_2"
52+
level_3
53+
end
54+
55+
def level_1
56+
puts "In level_1"
57+
level_2
58+
end
59+
60+
def main_crash_test
61+
puts "Starting crash test with nested functions"
62+
level_1
63+
end
64+
65+
# Main test
66+
puts "Starting standalone crashtracker test..."
67+
68+
# Start server
69+
server, get_crash_report = start_crash_server(9999)
70+
sleep 0.1 # Let server start
71+
72+
puts "Forking process to test crashtracker..."
73+
74+
pid = fork do
75+
begin
76+
puts "Child process started"
77+
78+
# Configure crashtracker
79+
Datadog.configure do |c|
80+
c.agent.host = '127.0.0.1'
81+
c.agent.port = 9999
82+
end
83+
84+
puts "Crashtracker configured, starting crash test..."
85+
86+
# Call our nested function that will crash
87+
main_crash_test
88+
89+
rescue => e
90+
puts "Unexpected error in child: #{e}"
91+
exit 1
92+
end
93+
end
94+
95+
# Wait for child process
96+
Process.wait(pid)
97+
puts "Child process finished with status: #{$?.exitstatus}"
98+
99+
# Give server time to receive the crash report
100+
sleep 1
101+
102+
# Get and save the crash report
103+
crash_report = get_crash_report.call
104+
if crash_report
105+
# Write full crash report to tmp file
106+
timestamp = Time.now.strftime("%Y%m%d_%H%M%S")
107+
crash_file = "/tmp/crashtracker_report_#{timestamp}.json"
108+
File.write(crash_file, JSON.pretty_generate(crash_report))
109+
puts "\n=== CRASH REPORT SAVED ==="
110+
puts "Full crash report written to: #{crash_file}"
111+
112+
puts "\n=== RUNTIME STACK ANALYSIS ==="
113+
if crash_report.is_a?(Hash) && crash_report.dig('payload', 0, 'message')
114+
message = JSON.parse(crash_report.dig('payload', 0, 'message'))
115+
runtime_stack = message['experimental']['runtime_stack']
116+
if runtime_stack
117+
puts "Runtime stack format: #{runtime_stack['format']}"
118+
puts "Number of frames captured: #{runtime_stack['frames']&.length || 0}"
119+
puts "\nStack frames:"
120+
runtime_stack['frames']&.each_with_index do |frame, i|
121+
puts " [#{i}] #{frame['function']} @ #{frame['file']}:#{frame['line']}"
122+
end
123+
124+
runtime_stack_file = "/tmp/runtime_stack_#{timestamp}.json"
125+
File.write(runtime_stack_file, JSON.pretty_generate(runtime_stack))
126+
puts "\nRuntime stack saved to: #{runtime_stack_file}"
127+
else
128+
puts "No runtime_stack found in crash report"
129+
end
130+
else
131+
puts "Could not parse crash report structure"
132+
end
133+
else
134+
puts "No crash report received"
135+
end
136+
137+
# Cleanup
138+
server.shutdown
139+
puts "\nTest complete."

0 commit comments

Comments
 (0)