diff --git a/ext/datadog_profiling_native_extension/clock_id_from_pthread.c b/ext/datadog_profiling_native_extension/clock_id_from_pthread.c index be033110f73..69920a51832 100644 --- a/ext/datadog_profiling_native_extension/clock_id_from_pthread.c +++ b/ext/datadog_profiling_native_extension/clock_id_from_pthread.c @@ -11,6 +11,7 @@ #include "clock_id.h" #include "helpers.h" #include "private_vm_api_access.h" +#include "ruby_helpers.h" #include "time_helpers.h" // Validate that our home-cooked pthread_id_for() matches pthread_self() for the current thread @@ -18,7 +19,7 @@ void self_test_clock_id(void) { rb_nativethread_id_t expected_pthread_id = pthread_self(); rb_nativethread_id_t actual_pthread_id = pthread_id_for(rb_thread_current()); - if (expected_pthread_id != actual_pthread_id) rb_raise(rb_eRuntimeError, "pthread_id_for() self-test failed"); + if (expected_pthread_id != actual_pthread_id) raise_error(eNativeRuntimeError, "pthread_id_for() self-test failed"); } // Safety: This function is assumed never to raise exceptions by callers diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index 022c4588705..b791af779fb 100644 --- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -289,7 +289,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) { after_gc_from_postponed_job_handle == POSTPONED_JOB_HANDLE_INVALID || after_gvl_running_from_postponed_job_handle == POSTPONED_JOB_HANDLE_INVALID ) { - rb_raise(rb_eRuntimeError, "Failed to register profiler postponed jobs (got POSTPONED_JOB_HANDLE_INVALID)"); + raise_error(eNativeRuntimeError, "Failed to register profiler postponed jobs (got POSTPONED_JOB_HANDLE_INVALID)"); } #else gc_finalize_deferred_workaround = objspace_ptr_for_gc_finalize_deferred_workaround(); @@ -472,10 +472,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) { cpu_and_wall_time_worker_state *old_state = active_sampler_instance_state; if (old_state != NULL) { if (is_thread_alive(old_state->owner_thread)) { - rb_raise( - rb_eRuntimeError, - "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread" - ); + raise_error(eNativeRuntimeError, "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"); } else { // The previously active thread seems to have died without cleaning up after itself. // In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing tracepoint @@ -793,7 +790,7 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) { // Final preparations: Setup signal handler and enable tracepoints. We run these here and not in `_native_sampling_loop` // because they may raise exceptions. - install_sigprof_signal_handler(handle_sampling_signal, "handle_sampling_signal"); + install_sigprof_signal_handler(handle_sampling_signal); if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint); if (state->allocation_profiling_enabled) { rb_add_event_hook2( @@ -821,7 +818,7 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) { NULL ); #else - rb_raise(rb_eArgError, "GVL profiling is not supported in this Ruby version"); + raise_error(eNativeArgumentError, "GVL profiling is not supported in this Ruby version"); #endif } @@ -851,7 +848,7 @@ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED si // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec. // It SHOULD NOT be used for other purposes. static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) { - install_sigprof_signal_handler(testing_signal_handler, "testing_signal_handler"); + install_sigprof_signal_handler(testing_signal_handler); return Qtrue; } @@ -1093,7 +1090,7 @@ static void reset_stats_not_thread_safe(cpu_and_wall_time_worker_state *state) { static void sleep_for(uint64_t time_ns) { // As a simplification, we currently only support setting .tv_nsec if (time_ns >= SECONDS_AS_NS(1)) { - grab_gvl_and_raise(rb_eArgError, "sleep_for can only sleep for less than 1 second, time_ns: %"PRIu64, time_ns); + grab_gvl_and_raise(eNativeArgumentError, "sleep_for can only sleep for less than 1 second, time_ns: %"PRIu64, time_ns); } struct timespec time_to_sleep = {.tv_nsec = time_ns}; @@ -1284,7 +1281,7 @@ static VALUE rescued_sample_allocation(DDTRACE_UNUSED VALUE unused) { static void delayed_error(cpu_and_wall_time_worker_state *state, const char *error) { // If we can't raise an immediate exception at the calling site, use the asynchronous flow through the main worker loop. - stop_state(state, rb_exc_new_cstr(rb_eRuntimeError, error)); + stop_state(state, rb_exc_new_cstr(eNativeRuntimeError, error)); } static VALUE _native_delayed_error(DDTRACE_UNUSED VALUE self, VALUE instance, VALUE error_msg) { diff --git a/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c b/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c index 931adf85cb5..ab27fc6bda0 100644 --- a/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c +++ b/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c @@ -51,7 +51,7 @@ void discrete_dynamic_sampler_reset(discrete_dynamic_sampler *sampler, long now_ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sampler *sampler, double target_overhead, long now_ns) { if (target_overhead <= 0 || target_overhead > 100) { - rb_raise(rb_eArgError, "Target overhead must be a double between ]0,100] was %f", target_overhead); + raise_error(eNativeArgumentError, "Target overhead must be a double between ]0,100] was %f", target_overhead); } sampler->target_overhead = target_overhead; return discrete_dynamic_sampler_reset(sampler, now_ns); @@ -369,7 +369,7 @@ static VALUE _native_new(VALUE klass) { long now_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE); if (now_ns == 0) { - rb_raise(rb_eRuntimeError, "failed to get clock time"); + raise_error(eNativeRuntimeError, "failed to get clock time"); } discrete_dynamic_sampler_init(&state->sampler, "test sampler", now_ns); diff --git a/ext/datadog_profiling_native_extension/collectors_gc_profiling_helper.c b/ext/datadog_profiling_native_extension/collectors_gc_profiling_helper.c index e9dd6c727d9..343d250ac05 100644 --- a/ext/datadog_profiling_native_extension/collectors_gc_profiling_helper.c +++ b/ext/datadog_profiling_native_extension/collectors_gc_profiling_helper.c @@ -2,6 +2,7 @@ #include #include "collectors_gc_profiling_helper.h" +#include "ruby_helpers.h" // This helper is used by the Datadog::Profiling::Collectors::ThreadContext to profile garbage collection. // It's tested through that class' interfaces. @@ -71,7 +72,7 @@ uint8_t gc_profiling_set_metadata(ddog_prof_Label *labels, int labels_length) { 1; // gc type if (max_label_count > labels_length) { - rb_raise(rb_eArgError, "BUG: gc_profiling_set_metadata invalid labels_length (%d) < max_label_count (%d)", labels_length, max_label_count); + raise_error(eNativeArgumentError, "BUG: gc_profiling_set_metadata invalid labels_length (%d) < max_label_count (%d)", labels_length, max_label_count); } uint8_t label_pos = 0; @@ -119,7 +120,7 @@ uint8_t gc_profiling_set_metadata(ddog_prof_Label *labels, int labels_length) { }; if (label_pos > max_label_count) { - rb_raise(rb_eRuntimeError, "BUG: gc_profiling_set_metadata unexpected label_pos (%d) > max_label_count (%d)", label_pos, max_label_count); + raise_error(eNativeRuntimeError, "BUG: gc_profiling_set_metadata unexpected label_pos (%d) > max_label_count (%d)", label_pos, max_label_count); } return label_pos; diff --git a/ext/datadog_profiling_native_extension/collectors_idle_sampling_helper.c b/ext/datadog_profiling_native_extension/collectors_idle_sampling_helper.c index fcbc42eaa94..e7797822489 100644 --- a/ext/datadog_profiling_native_extension/collectors_idle_sampling_helper.c +++ b/ext/datadog_profiling_native_extension/collectors_idle_sampling_helper.c @@ -153,7 +153,7 @@ static void *run_idle_sampling_loop(void *state_ptr) { // Process pending action if (next_action == ACTION_RUN) { if (run_action_function == NULL) { - grab_gvl_and_raise(rb_eRuntimeError, "Unexpected NULL run_action_function in run_idle_sampling_loop"); + grab_gvl_and_raise(eNativeRuntimeError, "Unexpected NULL run_action_function in run_idle_sampling_loop"); } run_action_function(); @@ -206,7 +206,7 @@ static VALUE _native_stop(DDTRACE_UNUSED VALUE self, VALUE self_instance) { void idle_sampling_helper_request_action(VALUE self_instance, void (*run_action_function)(void)) { idle_sampling_loop_state *state; if (!rb_typeddata_is_kind_of(self_instance, &idle_sampling_helper_typed_data)) { - grab_gvl_and_raise(rb_eTypeError, "Wrong argument for idle_sampling_helper_request_action"); + grab_gvl_and_raise(eNativeTypeError, "Wrong argument for idle_sampling_helper_request_action"); } // This should never fail the the above check passes TypedData_Get_Struct(self_instance, idle_sampling_loop_state, &idle_sampling_helper_typed_data, state); diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index 94ca42cbdca..0df0451e46a 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -17,6 +17,7 @@ #include "datadog_ruby_common.h" #include "private_vm_api_access.h" +#include "ruby_helpers.h" #include "stack_recorder.h" #include "collectors_stack.h" @@ -284,11 +285,11 @@ void sample_thread( // here, but >= 0 makes this easier to understand/debug. bool only_wall_time = cpu_or_wall_sample && values.cpu_time_ns == 0 && values.wall_time_ns >= 0; - if (cpu_or_wall_sample && state_label == NULL) rb_raise(rb_eRuntimeError, "BUG: Unexpected missing state_label"); + if (cpu_or_wall_sample && state_label == NULL) raise_error(eNativeRuntimeError, "BUG: Unexpected missing state_label"); if (has_cpu_time) { state_label->str = DDOG_CHARSLICE_C("had cpu"); - if (labels.is_gvl_waiting_state) rb_raise(rb_eRuntimeError, "BUG: Unexpected combination of cpu-time with is_gvl_waiting"); + if (labels.is_gvl_waiting_state) raise_error(eNativeRuntimeError, "BUG: Unexpected combination of cpu-time with is_gvl_waiting"); } int top_of_stack_position = captured_frames - 1; @@ -600,8 +601,8 @@ bool prepare_sample_thread(VALUE thread, sampling_buffer *buffer) { } uint16_t sampling_buffer_check_max_frames(int max_frames) { - if (max_frames < 5) rb_raise(rb_eArgError, "Invalid max_frames: value must be >= 5"); - if (max_frames > MAX_FRAMES_LIMIT) rb_raise(rb_eArgError, "Invalid max_frames: value must be <= " MAX_FRAMES_LIMIT_AS_STRING); + if (max_frames < 5) raise_error(eNativeArgumentError, "Invalid max_frames: value must be >= 5"); + if (max_frames > MAX_FRAMES_LIMIT) raise_error(eNativeArgumentError, "Invalid max_frames: value must be <= " MAX_FRAMES_LIMIT_AS_STRING); return max_frames; } @@ -618,7 +619,7 @@ void sampling_buffer_initialize(sampling_buffer *buffer, uint16_t max_frames, dd void sampling_buffer_free(sampling_buffer *buffer) { if (buffer->max_frames == 0 || buffer->locations == NULL || buffer->stack_buffer == NULL) { - rb_raise(rb_eArgError, "sampling_buffer_free called with invalid buffer"); + raise_error(eNativeArgumentError, "sampling_buffer_free called with invalid buffer"); } ruby_xfree(buffer->stack_buffer); diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index f2cc76c6021..c9691845968 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -8,6 +8,7 @@ #include "helpers.h" #include "libdatadog_helpers.h" #include "private_vm_api_access.h" +#include "ruby_helpers.h" #include "stack_recorder.h" #include "time_helpers.h" #include "unsafe_api_calls_check.h" @@ -518,7 +519,7 @@ static VALUE _native_initialize(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _sel } else if (otel_context_enabled == ID2SYM(rb_intern("both"))) { state->otel_context_enabled = OTEL_CONTEXT_ENABLED_BOTH; } else { - rb_raise(rb_eArgError, "Unexpected value for otel_context_enabled: %+" PRIsVALUE, otel_context_enabled); + raise_error(eNativeArgumentError, "Unexpected value for otel_context_enabled: %+" PRIsVALUE, otel_context_enabled); } global_waiting_for_gvl_threshold_ns = NUM2UINT(waiting_for_gvl_threshold_ns); @@ -539,7 +540,7 @@ static VALUE _native_initialize(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _sel static VALUE _native_sample(DDTRACE_UNUSED VALUE _self, VALUE collector_instance, VALUE profiler_overhead_stack_thread, VALUE allow_exception) { ENFORCE_BOOLEAN(allow_exception); - if (!is_thread_alive(profiler_overhead_stack_thread)) rb_raise(rb_eArgError, "Unexpected: profiler_overhead_stack_thread is not alive"); + if (!is_thread_alive(profiler_overhead_stack_thread)) raise_error(eNativeArgumentError, "Unexpected: profiler_overhead_stack_thread is not alive"); if (allow_exception == Qfalse) debug_enter_unsafe_context(); @@ -831,7 +832,7 @@ VALUE thread_context_collector_sample_after_gc(VALUE self_instance) { TypedData_Get_Struct(self_instance, thread_context_collector_state, &thread_context_collector_typed_data, state); if (state->gc_tracking.wall_time_at_previous_gc_ns == INVALID_TIME) { - rb_raise(rb_eRuntimeError, "BUG: Unexpected call to sample_after_gc without valid GC information available"); + raise_error(eNativeRuntimeError, "BUG: Unexpected call to sample_after_gc without valid GC information available"); } int max_labels_needed_for_gc = 7; // Magic number gets validated inside gc_profiling_set_metadata @@ -998,7 +999,7 @@ static void trigger_sample_for_thread( // @ivoanjo: I wonder if C compilers are smart enough to statically prove this check never triggers unless someone // changes the code erroneously and remove it entirely? if (label_pos > max_label_count) { - rb_raise(rb_eRuntimeError, "BUG: Unexpected label_pos (%d) > max_label_count (%d)", label_pos, max_label_count); + raise_error(eNativeRuntimeError, "BUG: Unexpected label_pos (%d) > max_label_count (%d)", label_pos, max_label_count); } ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = label_pos}; @@ -1295,7 +1296,7 @@ static long update_time_since_previous_sample(long *time_at_previous_sample_ns, elapsed_time_ns = 0; } else { // We don't expect non-wall time to go backwards, so let's flag this as a bug - rb_raise(rb_eRuntimeError, "BUG: Unexpected negative elapsed_time_ns between samples"); + raise_error(eNativeRuntimeError, "BUG: Unexpected negative elapsed_time_ns between samples"); } } @@ -1961,7 +1962,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) { thread_context_collector_state *state; TypedData_Get_Struct(self_instance, thread_context_collector_state, &thread_context_collector_typed_data, state); - if (!state->timeline_enabled) rb_raise(rb_eRuntimeError, "GVL profiling requires timeline to be enabled"); + if (!state->timeline_enabled) raise_error(eNativeRuntimeError, "GVL profiling requires timeline to be enabled"); intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(current_thread); @@ -2154,7 +2155,7 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) { TypedData_Get_Struct(collector_instance, thread_context_collector_state, &thread_context_collector_typed_data, state); per_thread_context *thread_context = get_context_for(thread, state); - if (thread_context == NULL) rb_raise(rb_eArgError, "Unexpected: This method cannot be used unless the per-thread context for the thread already exists"); + if (thread_context == NULL) raise_error(eNativeArgumentError, "Unexpected: This method cannot be used unless the per-thread context for the thread already exists"); thread_context->cpu_time_at_previous_sample_ns += NUM2LONG(delta_ns); diff --git a/ext/datadog_profiling_native_extension/datadog_ruby_common.c b/ext/datadog_profiling_native_extension/datadog_ruby_common.c index ca402dca264..461fd245a7f 100644 --- a/ext/datadog_profiling_native_extension/datadog_ruby_common.c +++ b/ext/datadog_profiling_native_extension/datadog_ruby_common.c @@ -1,4 +1,5 @@ #include "datadog_ruby_common.h" +#include // IMPORTANT: Currently this file is copy-pasted between extensions. Make sure to update all versions when doing any change! @@ -18,6 +19,14 @@ void raise_unexpected_type(VALUE value, const char *value_name, const char *type ); } +void raise_error(VALUE error_class, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + VALUE message = rb_vsprintf(fmt, args); + va_end(args); + rb_raise(error_class, "%"PRIsVALUE, message); +} + VALUE datadog_gem_version(void) { VALUE ddtrace_module = rb_const_get(rb_cObject, rb_intern("Datadog")); ENFORCE_TYPE(ddtrace_module, T_MODULE); diff --git a/ext/datadog_profiling_native_extension/datadog_ruby_common.h b/ext/datadog_profiling_native_extension/datadog_ruby_common.h index d55c2bb479f..adadee49fb8 100644 --- a/ext/datadog_profiling_native_extension/datadog_ruby_common.h +++ b/ext/datadog_profiling_native_extension/datadog_ruby_common.h @@ -32,6 +32,9 @@ NORETURN(void raise_unexpected_type(VALUE value, const char *value_name, const char *type_name, const char *file, int line, const char* function_name)); +// Helper to raise errors with formatted messages +NORETURN(void raise_error(VALUE error_class, const char *fmt, ...)); + // Helper to retrieve Datadog::VERSION::STRING VALUE datadog_gem_version(void); diff --git a/ext/datadog_profiling_native_extension/encoded_profile.c b/ext/datadog_profiling_native_extension/encoded_profile.c index cc28b9e40c5..1a446f6ef1b 100644 --- a/ext/datadog_profiling_native_extension/encoded_profile.c +++ b/ext/datadog_profiling_native_extension/encoded_profile.c @@ -1,6 +1,7 @@ #include "encoded_profile.h" #include "datadog_ruby_common.h" #include "libdatadog_helpers.h" +#include "ruby_helpers.h" // This class exists to wrap a ddog_prof_EncodedProfile into a Ruby object // This file implements the native bits of the Datadog::Profiling::EncodedProfile class @@ -41,7 +42,7 @@ VALUE from_ddog_prof_EncodedProfile(ddog_prof_EncodedProfile profile) { static ddog_ByteSlice get_bytes(ddog_prof_EncodedProfile *state) { ddog_prof_Result_ByteSlice raw_bytes = ddog_prof_EncodedProfile_bytes(state); if (raw_bytes.tag == DDOG_PROF_RESULT_BYTE_SLICE_ERR_BYTE_SLICE) { - rb_raise(rb_eRuntimeError, "Failed to get bytes from profile: %"PRIsVALUE, get_error_details_and_drop(&raw_bytes.err)); + raise_error(eNativeRuntimeError, "Failed to get bytes from profile: %"PRIsVALUE, get_error_details_and_drop(&raw_bytes.err)); } return raw_bytes.ok; } diff --git a/ext/datadog_profiling_native_extension/heap_recorder.c b/ext/datadog_profiling_native_extension/heap_recorder.c index f869d24c6d3..9dd861a930a 100644 --- a/ext/datadog_profiling_native_extension/heap_recorder.c +++ b/ext/datadog_profiling_native_extension/heap_recorder.c @@ -259,7 +259,7 @@ void heap_recorder_set_sample_rate(heap_recorder *heap_recorder, int sample_rate } if (sample_rate <= 0) { - rb_raise(rb_eArgError, "Heap sample rate must be a positive integer value but was %d", sample_rate); + raise_error(eNativeArgumentError, "Heap sample rate must be a positive integer value but was %d", sample_rate); } heap_recorder->sample_rate = sample_rate; @@ -300,7 +300,7 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj } if (heap_recorder->active_recording != NULL) { - rb_raise(rb_eRuntimeError, "Detected consecutive heap allocation recording starts without end."); + raise_error(eNativeRuntimeError, "Detected consecutive heap allocation recording starts without end."); } if (++heap_recorder->num_recordings_skipped < heap_recorder->sample_rate || @@ -323,7 +323,7 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj VALUE ruby_obj_id = rb_obj_id(new_obj); if (!FIXNUM_P(ruby_obj_id)) { - rb_raise(rb_eRuntimeError, "Detected a bignum object id. These are not supported by heap profiling."); + raise_error(eNativeRuntimeError, "Detected a bignum object id. These are not supported by heap profiling."); } heap_recorder->active_recording = object_record_new( @@ -371,7 +371,7 @@ static VALUE end_heap_allocation_recording(VALUE protect_args) { if (active_recording == NULL) { // Recording ended without having been started? - rb_raise(rb_eRuntimeError, "Ended a heap recording that was not started"); + raise_error(eNativeRuntimeError, "Ended a heap recording that was not started"); } // From now on, mark the global active recording as invalid so we can short-circuit at any point // and not end up with a still active recording. the local active_recording still holds the @@ -487,14 +487,14 @@ void heap_recorder_prepare_iteration(heap_recorder *heap_recorder) { if (heap_recorder->object_records_snapshot != NULL) { // we could trivially handle this but we raise to highlight and catch unexpected usages. - rb_raise(rb_eRuntimeError, "New heap recorder iteration prepared without the previous one having been finished."); + raise_error(eNativeRuntimeError, "New heap recorder iteration prepared without the previous one having been finished."); } heap_recorder_update(heap_recorder, /* full_update: */ true); heap_recorder->object_records_snapshot = st_copy(heap_recorder->object_records); if (heap_recorder->object_records_snapshot == NULL) { - rb_raise(rb_eRuntimeError, "Failed to create heap snapshot."); + raise_error(eNativeRuntimeError, "Failed to create heap snapshot."); } } @@ -505,7 +505,7 @@ void heap_recorder_finish_iteration(heap_recorder *heap_recorder) { if (heap_recorder->object_records_snapshot == NULL) { // we could trivially handle this but we raise to highlight and catch unexpected usages. - rb_raise(rb_eRuntimeError, "Heap recorder iteration finished without having been prepared."); + raise_error(eNativeRuntimeError, "Heap recorder iteration finished without having been prepared."); } st_free_table(heap_recorder->object_records_snapshot); @@ -581,7 +581,7 @@ typedef struct { VALUE heap_recorder_testonly_debug(heap_recorder *heap_recorder) { if (heap_recorder == NULL) { - rb_raise(rb_eArgError, "heap_recorder is NULL"); + raise_error(eNativeArgumentError, "heap_recorder is NULL"); } VALUE debug_str = rb_str_new2("object records:\n"); @@ -733,7 +733,7 @@ static void commit_recording(heap_recorder *heap_recorder, heap_record *heap_rec // needed to fully build the object_record. active_recording->heap_record = heap_record; if (heap_record->num_tracked_objects == UINT32_MAX) { - rb_raise(rb_eRuntimeError, "Reached maximum number of tracked objects for heap record"); + raise_error(eNativeRuntimeError, "Reached maximum number of tracked objects for heap record"); } heap_record->num_tracked_objects++; @@ -741,11 +741,11 @@ static void commit_recording(heap_recorder *heap_recorder, heap_record *heap_rec if (existing_error) { object_record *existing_record = NULL; st_lookup(heap_recorder->object_records, active_recording->obj_id, (st_data_t *) &existing_record); - if (existing_record == NULL) rb_raise(rb_eRuntimeError, "Unexpected NULL when reading existing record"); + if (existing_record == NULL) raise_error(eNativeRuntimeError, "Unexpected NULL when reading existing record"); VALUE existing_inspect = object_record_inspect(heap_recorder, existing_record); VALUE new_inspect = object_record_inspect(heap_recorder, active_recording); - rb_raise(rb_eRuntimeError, "Object ids are supposed to be unique. We got 2 allocation recordings with " + raise_error(eNativeRuntimeError, "Object ids are supposed to be unique. We got 2 allocation recordings with " "the same id. previous={%"PRIsVALUE"} new={%"PRIsVALUE"}", existing_inspect, new_inspect); } } @@ -781,7 +781,7 @@ static void cleanup_heap_record_if_unused(heap_recorder *heap_recorder, heap_rec } if (!st_delete(heap_recorder->heap_records, (st_data_t*) &heap_record, NULL)) { - rb_raise(rb_eRuntimeError, "Attempted to cleanup an untracked heap_record"); + raise_error(eNativeRuntimeError, "Attempted to cleanup an untracked heap_record"); }; heap_record_free(heap_recorder, heap_record); } @@ -791,14 +791,14 @@ static void on_committed_object_record_cleanup(heap_recorder *heap_recorder, obj // (See PROF-10656 Datadog-internal for details). Just in case, I've sprinkled a bunch of NULL tests in this function for now. // Once we figure out the issue we can get rid of them again. - if (heap_recorder == NULL) rb_raise(rb_eRuntimeError, "heap_recorder was NULL in on_committed_object_record_cleanup"); - if (heap_recorder->heap_records == NULL) rb_raise(rb_eRuntimeError, "heap_recorder->heap_records was NULL in on_committed_object_record_cleanup"); - if (record == NULL) rb_raise(rb_eRuntimeError, "record was NULL in on_committed_object_record_cleanup"); + if (heap_recorder == NULL) raise_error(eNativeRuntimeError, "heap_recorder was NULL in on_committed_object_record_cleanup"); + if (heap_recorder->heap_records == NULL) raise_error(eNativeRuntimeError, "heap_recorder->heap_records was NULL in on_committed_object_record_cleanup"); + if (record == NULL) raise_error(eNativeRuntimeError, "record was NULL in on_committed_object_record_cleanup"); // Starting with the associated heap record. There will now be one less tracked object pointing to it heap_record *heap_record = record->heap_record; - if (heap_record == NULL) rb_raise(rb_eRuntimeError, "heap_record was NULL in on_committed_object_record_cleanup"); + if (heap_record == NULL) raise_error(eNativeRuntimeError, "heap_record was NULL in on_committed_object_record_cleanup"); heap_record->num_tracked_objects--; @@ -862,7 +862,7 @@ heap_record* heap_record_new(heap_recorder *recorder, ddog_prof_Slice_Location l uint16_t frames_len = locations.len; if (frames_len > MAX_FRAMES_LIMIT) { // This is not expected as MAX_FRAMES_LIMIT is shared with the stacktrace construction mechanism - rb_raise(rb_eRuntimeError, "Found stack with more than %d frames (%d)", MAX_FRAMES_LIMIT, frames_len); + raise_error(eNativeRuntimeError, "Found stack with more than %d frames (%d)", MAX_FRAMES_LIMIT, frames_len); } heap_record *stack = calloc(1, sizeof(heap_record) + frames_len * sizeof(heap_frame)); // See "note on calloc vs ruby_xcalloc use" above stack->num_tracked_objects = 0; @@ -933,21 +933,21 @@ static void unintern_or_raise(heap_recorder *recorder, ddog_prof_ManagedStringId ddog_prof_MaybeError result = ddog_prof_ManagedStringStorage_unintern(recorder->string_storage, id); if (result.tag == DDOG_PROF_OPTION_ERROR_SOME_ERROR) { - rb_raise(rb_eRuntimeError, "Failed to unintern id: %"PRIsVALUE, get_error_details_and_drop(&result.some)); + raise_error(eNativeRuntimeError, "Failed to unintern id: %"PRIsVALUE, get_error_details_and_drop(&result.some)); } } static void unintern_all_or_raise(heap_recorder *recorder, ddog_prof_Slice_ManagedStringId ids) { ddog_prof_MaybeError result = ddog_prof_ManagedStringStorage_unintern_all(recorder->string_storage, ids); if (result.tag == DDOG_PROF_OPTION_ERROR_SOME_ERROR) { - rb_raise(rb_eRuntimeError, "Failed to unintern_all: %"PRIsVALUE, get_error_details_and_drop(&result.some)); + raise_error(eNativeRuntimeError, "Failed to unintern_all: %"PRIsVALUE, get_error_details_and_drop(&result.some)); } } static VALUE get_ruby_string_or_raise(heap_recorder *recorder, ddog_prof_ManagedStringId id) { ddog_StringWrapperResult get_string_result = ddog_prof_ManagedStringStorage_get_string(recorder->string_storage, id); if (get_string_result.tag == DDOG_STRING_WRAPPER_RESULT_ERR) { - rb_raise(rb_eRuntimeError, "Failed to get string: %"PRIsVALUE, get_error_details_and_drop(&get_string_result.err)); + raise_error(eNativeRuntimeError, "Failed to get string: %"PRIsVALUE, get_error_details_and_drop(&get_string_result.err)); } VALUE ruby_string = ruby_string_from_vec_u8(get_string_result.ok.message); ddog_StringWrapper_drop((ddog_StringWrapper *) &get_string_result.ok); @@ -962,7 +962,7 @@ static inline double ewma_stat(double previous, double current) { VALUE heap_recorder_testonly_is_object_recorded(heap_recorder *heap_recorder, VALUE obj_id) { if (heap_recorder == NULL) { - rb_raise(rb_eArgError, "heap_recorder is NULL"); + raise_error(eNativeArgumentError, "heap_recorder is NULL"); } // Check if object records contains an object with this object_id @@ -971,15 +971,15 @@ VALUE heap_recorder_testonly_is_object_recorded(heap_recorder *heap_recorder, VA void heap_recorder_testonly_reset_last_update(heap_recorder *heap_recorder) { if (heap_recorder == NULL) { - rb_raise(rb_eArgError, "heap_recorder is NULL"); + raise_error(eNativeArgumentError, "heap_recorder is NULL"); } heap_recorder->last_update_ns = 0; } void heap_recorder_testonly_benchmark_intern(heap_recorder *heap_recorder, ddog_CharSlice string, int times, bool use_all) { - if (heap_recorder == NULL) rb_raise(rb_eArgError, "heap profiling must be enabled"); - if (times > REUSABLE_FRAME_DETAILS_SIZE) rb_raise(rb_eArgError, "times cannot be > than REUSABLE_FRAME_DETAILS_SIZE"); + if (heap_recorder == NULL) raise_error(eNativeArgumentError, "heap profiling must be enabled"); + if (times > REUSABLE_FRAME_DETAILS_SIZE) raise_error(eNativeArgumentError, "times cannot be > than REUSABLE_FRAME_DETAILS_SIZE"); if (use_all) { ddog_CharSlice *strings = heap_recorder->reusable_char_slices; diff --git a/ext/datadog_profiling_native_extension/http_transport.c b/ext/datadog_profiling_native_extension/http_transport.c index 52e4db67c5d..b0c12869926 100644 --- a/ext/datadog_profiling_native_extension/http_transport.c +++ b/ext/datadog_profiling_native_extension/http_transport.c @@ -85,7 +85,7 @@ static ddog_prof_Endpoint endpoint_from(VALUE exporter_configuration) { return ddog_prof_Endpoint_agent(char_slice_from_ruby_string(base_url)); } else { - rb_raise(rb_eArgError, "Failed to initialize transport: Unexpected working mode, expected :agentless or :agent"); + raise_error(eNativeArgumentError, "Failed to initialize transport: Unexpected working mode, expected :agentless or :agent"); } } @@ -112,7 +112,7 @@ static ddog_prof_ProfileExporter_Result create_exporter(VALUE exporter_configura } static void validate_token(ddog_CancellationToken token, const char *file, int line) { - if (token.inner == NULL) rb_raise(rb_eRuntimeError, "Unexpected: Validation token was empty at %s:%d", file, line); + if (token.inner == NULL) raise_error(eNativeRuntimeError, "Unexpected: Validation token was empty at %s:%d", file, line); } static VALUE handle_exporter_failure(ddog_prof_ProfileExporter_Result exporter_result) { diff --git a/ext/datadog_profiling_native_extension/libdatadog_helpers.c b/ext/datadog_profiling_native_extension/libdatadog_helpers.c index 16185b8080b..72f63ebefe0 100644 --- a/ext/datadog_profiling_native_extension/libdatadog_helpers.c +++ b/ext/datadog_profiling_native_extension/libdatadog_helpers.c @@ -1,6 +1,7 @@ #include "libdatadog_helpers.h" #include +#include "ruby_helpers.h" const char *ruby_value_type_to_string(enum ruby_value_type type) { return ruby_value_type_to_char_slice(type).ptr; @@ -66,7 +67,7 @@ ddog_prof_ManagedStringId intern_or_raise(ddog_prof_ManagedStringStorage string_ ddog_prof_ManagedStringStorageInternResult intern_result = ddog_prof_ManagedStringStorage_intern(string_storage, string); if (intern_result.tag == DDOG_PROF_MANAGED_STRING_STORAGE_INTERN_RESULT_ERR) { - rb_raise(rb_eRuntimeError, "Failed to intern string: %"PRIsVALUE, get_error_details_and_drop(&intern_result.err)); + raise_error(eNativeRuntimeError, "Failed to intern string: %"PRIsVALUE, get_error_details_and_drop(&intern_result.err)); } return intern_result.ok; } @@ -79,6 +80,6 @@ void intern_all_or_raise( ) { ddog_prof_MaybeError result = ddog_prof_ManagedStringStorage_intern_all(string_storage, strings, output_ids, output_ids_size); if (result.tag == DDOG_PROF_OPTION_ERROR_SOME_ERROR) { - rb_raise(rb_eRuntimeError, "Failed to intern_all: %"PRIsVALUE, get_error_details_and_drop(&result.some)); + raise_error(eNativeRuntimeError, "Failed to intern_all: %"PRIsVALUE, get_error_details_and_drop(&result.some)); } } diff --git a/ext/datadog_profiling_native_extension/profiling.c b/ext/datadog_profiling_native_extension/profiling.c index bcf97709c14..822146eedc8 100644 --- a/ext/datadog_profiling_native_extension/profiling.c +++ b/ext/datadog_profiling_native_extension/profiling.c @@ -40,6 +40,7 @@ static VALUE _native_enforce_success(DDTRACE_UNUSED VALUE _self, VALUE syserr_er static void *trigger_enforce_success(void *trigger_args); static VALUE _native_malloc_stats(DDTRACE_UNUSED VALUE _self); static VALUE _native_safe_object_info(DDTRACE_UNUSED VALUE _self, VALUE obj); +static VALUE _native_raise_native_error_with_invalid_class(DDTRACE_UNUSED VALUE _self, VALUE invalid_exception_class, VALUE message, VALUE telemetry_message); void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) { // The profiler still has a lot of limitations around being used in Ractors BUT for now we're choosing to take care of those @@ -55,7 +56,7 @@ void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) { rb_define_singleton_method(native_extension_module, "native_working?", native_working_p, 0); rb_funcall(native_extension_module, rb_intern("private_class_method"), 1, ID2SYM(rb_intern("native_working?"))); - ruby_helpers_init(); + ruby_helpers_init(profiling_module); collectors_cpu_and_wall_time_worker_init(profiling_module); collectors_discrete_dynamic_sampler_init(profiling_module); collectors_dynamic_sampling_rate_init(profiling_module); @@ -84,6 +85,7 @@ void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) { rb_define_singleton_method(testing_module, "_native_enforce_success", _native_enforce_success, 2); rb_define_singleton_method(testing_module, "_native_malloc_stats", _native_malloc_stats, 0); rb_define_singleton_method(testing_module, "_native_safe_object_info", _native_safe_object_info, 1); + rb_define_singleton_method(testing_module, "_native_raise_native_error_with_invalid_class", _native_raise_native_error_with_invalid_class, 3); } static VALUE native_working_p(DDTRACE_UNUSED VALUE _self) { @@ -97,7 +99,7 @@ static VALUE native_working_p(DDTRACE_UNUSED VALUE _self) { typedef struct { VALUE exception_class; char *test_message; - int test_message_arg; + char *test_message_arg; } trigger_grab_gvl_and_raise_arguments; static VALUE _native_grab_gvl_and_raise(DDTRACE_UNUSED VALUE _self, VALUE exception_class, VALUE test_message, VALUE test_message_arg, VALUE release_gvl) { @@ -107,24 +109,24 @@ static VALUE _native_grab_gvl_and_raise(DDTRACE_UNUSED VALUE _self, VALUE except args.exception_class = exception_class; args.test_message = StringValueCStr(test_message); - args.test_message_arg = test_message_arg != Qnil ? NUM2INT(test_message_arg) : -1; + args.test_message_arg = test_message_arg != Qnil ? StringValueCStr(test_message_arg) : NULL; if (RTEST(release_gvl)) { rb_thread_call_without_gvl(trigger_grab_gvl_and_raise, &args, NULL, NULL); } else { - grab_gvl_and_raise(args.exception_class, "%s", args.test_message); + private_grab_gvl_and_raise(args.exception_class, args.test_message, args.test_message_arg); } - rb_raise(rb_eRuntimeError, "Failed to raise exception in _native_grab_gvl_and_raise; this should never happen"); + raise_error(eNativeRuntimeError, "Failed to raise exception in _native_grab_gvl_and_raise; this should never happen"); } static void *trigger_grab_gvl_and_raise(void *trigger_args) { trigger_grab_gvl_and_raise_arguments *args = (trigger_grab_gvl_and_raise_arguments *) trigger_args; - if (args->test_message_arg >= 0) { - grab_gvl_and_raise(args->exception_class, "%s%d", args->test_message, args->test_message_arg); + if (args->test_message_arg != NULL) { + private_grab_gvl_and_raise(args->exception_class, args->test_message, args->test_message_arg); } else { - grab_gvl_and_raise(args->exception_class, "%s", args->test_message); + private_grab_gvl_and_raise(args->exception_class, args->test_message); } return NULL; @@ -133,7 +135,7 @@ static void *trigger_grab_gvl_and_raise(void *trigger_args) { typedef struct { int syserr_errno; char *test_message; - int test_message_arg; + char *test_message_arg; } trigger_grab_gvl_and_raise_syserr_arguments; static VALUE _native_grab_gvl_and_raise_syserr(DDTRACE_UNUSED VALUE _self, VALUE syserr_errno, VALUE test_message, VALUE test_message_arg, VALUE release_gvl) { @@ -143,24 +145,24 @@ static VALUE _native_grab_gvl_and_raise_syserr(DDTRACE_UNUSED VALUE _self, VALUE args.syserr_errno = NUM2INT(syserr_errno); args.test_message = StringValueCStr(test_message); - args.test_message_arg = test_message_arg != Qnil ? NUM2INT(test_message_arg) : -1; + args.test_message_arg = test_message_arg != Qnil ? StringValueCStr(test_message_arg) : NULL; if (RTEST(release_gvl)) { rb_thread_call_without_gvl(trigger_grab_gvl_and_raise_syserr, &args, NULL, NULL); } else { - grab_gvl_and_raise_syserr(args.syserr_errno, "%s", args.test_message); + grab_gvl_and_raise_syserr(args.syserr_errno, args.test_message, args.test_message_arg); } - rb_raise(rb_eRuntimeError, "Failed to raise exception in _native_grab_gvl_and_raise_syserr; this should never happen"); + raise_error(eNativeRuntimeError, "Failed to raise exception in _native_grab_gvl_and_raise_syserr; this should never happen"); } static void *trigger_grab_gvl_and_raise_syserr(void *trigger_args) { trigger_grab_gvl_and_raise_syserr_arguments *args = (trigger_grab_gvl_and_raise_syserr_arguments *) trigger_args; - if (args->test_message_arg >= 0) { - grab_gvl_and_raise_syserr(args->syserr_errno, "%s%d", args->test_message, args->test_message_arg); + if (args->test_message_arg != NULL) { + grab_gvl_and_raise_syserr(args->syserr_errno, args->test_message, args->test_message_arg); } else { - grab_gvl_and_raise_syserr(args->syserr_errno, "%s", args->test_message); + grab_gvl_and_raise_syserr(args->syserr_errno, args->test_message); } return NULL; @@ -183,7 +185,7 @@ static void *testing_is_current_thread_holding_the_gvl(DDTRACE_UNUSED void *_unu } static VALUE _native_install_holding_the_gvl_signal_handler(DDTRACE_UNUSED VALUE _self) { - install_sigprof_signal_handler(holding_the_gvl_signal_handler, "holding_the_gvl_signal_handler"); + install_sigprof_signal_handler(holding_the_gvl_signal_handler); return Qtrue; } @@ -246,7 +248,7 @@ static VALUE _native_trigger_holding_the_gvl_signal_handler_on(DDTRACE_UNUSED VA replace_sigprof_signal_handler_with_empty_handler(holding_the_gvl_signal_handler); - if (holding_the_gvl_signal_handler_result[0] == Qfalse) rb_raise(rb_eRuntimeError, "Could not signal background_thread"); + if (holding_the_gvl_signal_handler_result[0] == Qfalse) raise_error(eNativeRuntimeError, "Could not signal background_thread"); VALUE result = rb_hash_new(); rb_hash_aset(result, ID2SYM(rb_intern("ruby_thread_has_gvl_p")), holding_the_gvl_signal_handler_result[1]); @@ -282,3 +284,10 @@ static VALUE _native_malloc_stats(DDTRACE_UNUSED VALUE _self) { static VALUE _native_safe_object_info(DDTRACE_UNUSED VALUE _self, VALUE obj) { return rb_str_new_cstr(safe_object_info(obj)); } + +static VALUE _native_raise_native_error_with_invalid_class(DDTRACE_UNUSED VALUE _self, VALUE invalid_exception_class, VALUE message, VALUE telemetry_message) { + ENFORCE_TYPE(message, T_STRING); + ENFORCE_TYPE(telemetry_message, T_STRING); + + private_raise_native_error(invalid_exception_class, StringValueCStr(message), StringValueCStr(telemetry_message)); +} diff --git a/ext/datadog_profiling_native_extension/ruby_helpers.c b/ext/datadog_profiling_native_extension/ruby_helpers.c index 61b9db1af4f..1be9405413a 100644 --- a/ext/datadog_profiling_native_extension/ruby_helpers.c +++ b/ext/datadog_profiling_native_extension/ruby_helpers.c @@ -11,43 +11,147 @@ static VALUE module_object_space = Qnil; static ID _id2ref_id = Qnil; static ID inspect_id = Qnil; static ID to_s_id = Qnil; +static ID new_id = 0; +static ID telemetry_message_id = Qnil; -void ruby_helpers_init(void) { +// Exception classes defined in Ruby, in the `Datadog::Profiling` namespace. +VALUE eNativeRuntimeError = Qnil; +VALUE eNativeArgumentError = Qnil; +VALUE eNativeTypeError = Qnil; + +void ruby_helpers_init(VALUE profiling_module) { rb_global_variable(&module_object_space); module_object_space = rb_const_get(rb_cObject, rb_intern("ObjectSpace")); _id2ref_id = rb_intern("_id2ref"); inspect_id = rb_intern("inspect"); to_s_id = rb_intern("to_s"); + new_id = rb_intern("new"); + telemetry_message_id = rb_intern("@telemetry_message"); + + eNativeRuntimeError = rb_const_get(profiling_module, rb_intern("NativeRuntimeError")); + rb_global_variable(&eNativeRuntimeError); + eNativeArgumentError = rb_const_get(profiling_module, rb_intern("NativeArgumentError")); + rb_global_variable(&eNativeArgumentError); + eNativeTypeError = rb_const_get(profiling_module, rb_intern("NativeTypeError")); + rb_global_variable(&eNativeTypeError); } #define MAX_RAISE_MESSAGE_SIZE 256 +#define FORMAT_VA_ERROR_MESSAGE(buf, fmt) \ + char buf[MAX_RAISE_MESSAGE_SIZE]; \ + va_list buf##_args; \ + va_start(buf##_args, fmt); \ + vsnprintf(buf, MAX_RAISE_MESSAGE_SIZE, fmt, buf##_args); \ + va_end(buf##_args); + +// Raises a NativeError exception with seperate telemetry-safe and detailed messages. +void private_raise_native_error(VALUE native_exception_class, const char *detailed_message, const char *static_message) { + if (native_exception_class != eNativeRuntimeError && + native_exception_class != eNativeArgumentError && + native_exception_class != eNativeTypeError) { + + const char* fmt = "private_raise_native_error called with an exception that might not support two error messages. " \ + "Expected eNativeRuntimeError, eNativeArgumentError, or eNativeTypeError, was: %s"; + + VALUE exception = rb_funcall( + eNativeArgumentError, + new_id, + 2, + rb_sprintf(fmt, rb_class2name(native_exception_class) ?: "(Unknown)"), + rb_str_new_cstr(fmt) + ); + rb_exc_raise(exception); + } + + VALUE exception = rb_funcall( + native_exception_class, + new_id, + 2, + rb_str_new_cstr(detailed_message), + rb_str_new_cstr(static_message) + ); + rb_exc_raise(exception); +} + +// Use `raise_error` the macro instead, as it provides additional argument checks. +void private_raise_error(VALUE native_exception_class, const char *fmt, ...) { + FORMAT_VA_ERROR_MESSAGE(formatted_msg, fmt); + private_raise_native_error(native_exception_class, formatted_msg, fmt); +} + typedef struct { VALUE exception_class; char exception_message[MAX_RAISE_MESSAGE_SIZE]; + char telemetry_message[MAX_RAISE_MESSAGE_SIZE]; } raise_args; static void *trigger_raise(void *raise_arguments) { raise_args *args = (raise_args *) raise_arguments; - rb_raise(args->exception_class, "%s", args->exception_message); + + private_raise_native_error( + args->exception_class, + args->exception_message, + args->telemetry_message + ); } -void grab_gvl_and_raise(VALUE exception_class, const char *format_string, ...) { +// Raises a SysErr exception with seperate telemetry-safe and detailed messages. +// The telemetry-safe message is set in the instance variable `@telemetry_message`. +void private_raise_syserr(int syserr_errno, const char *detailed_message, const char *static_message) { + VALUE error = rb_syserr_new(syserr_errno, detailed_message); + + // Because there are 150+ Errno error classes, + // we set the telemetry-safe message to an instance variable, instead + // of creating/modifying a large amount of exception classes. + rb_ivar_set(error, telemetry_message_id, rb_str_new_cstr(static_message)); + + rb_exc_raise(error); +} + +// The message must be statically bound and checked. +void raise_telemetry_safe_error(VALUE native_exception_class, const char *telemetry_safe_format, ...) { + FORMAT_VA_ERROR_MESSAGE(telemetry_safe_message, telemetry_safe_format); + private_raise_native_error(native_exception_class, telemetry_safe_message, telemetry_safe_message); +} + +// The message must be statically bound and checked. +void raise_telemetry_safe_syserr(int syserr_errno, const char *telemetry_safe_format, ...) { + FORMAT_VA_ERROR_MESSAGE(telemetry_safe_message, telemetry_safe_format); + + private_raise_syserr(syserr_errno, telemetry_safe_message, telemetry_safe_message); +} + +void private_grab_gvl_and_raise(VALUE native_exception_class, const char *format_string, ...) { raise_args args; - args.exception_class = exception_class; + args.exception_class = native_exception_class; - va_list format_string_arguments; - va_start(format_string_arguments, format_string); - vsnprintf(args.exception_message, MAX_RAISE_MESSAGE_SIZE, format_string, format_string_arguments); + FORMAT_VA_ERROR_MESSAGE(formatted_exception_message, format_string); + snprintf(args.exception_message, MAX_RAISE_MESSAGE_SIZE, "%s", formatted_exception_message); + snprintf(args.telemetry_message, MAX_RAISE_MESSAGE_SIZE, "%s", format_string); if (is_current_thread_holding_the_gvl()) { - rb_raise( - rb_eRuntimeError, - "grab_gvl_and_raise called by thread holding the global VM lock. exception_message: '%s'", - args.exception_message + // Render telemetry message without formatted arguments, only the exception class. + char telemetry_message[MAX_RAISE_MESSAGE_SIZE]; + snprintf( + telemetry_message, + MAX_RAISE_MESSAGE_SIZE, + "grab_gvl_and_raise called by thread holding the global VM lock: %s (%s)", + format_string, + rb_class2name(native_exception_class) ?: "(Unknown)" + ); + // Render the full exception message. + char exception_message[MAX_RAISE_MESSAGE_SIZE]; + snprintf( + exception_message, + MAX_RAISE_MESSAGE_SIZE, + "grab_gvl_and_raise called by thread holding the global VM lock: %s (%s)", + args.exception_message, + rb_class2name(native_exception_class) ?: "(Unknown)" ); + private_raise_native_error(eNativeRuntimeError, exception_message, telemetry_message); } rb_thread_call_with_gvl(trigger_raise, &args); @@ -62,7 +166,12 @@ typedef struct { static void *trigger_syserr_raise(void *syserr_raise_arguments) { syserr_raise_args *args = (syserr_raise_args *) syserr_raise_arguments; - rb_syserr_fail(args->syserr_errno, args->exception_message); + + private_raise_syserr( + args->syserr_errno, + args->exception_message, + args->exception_message + ); } void grab_gvl_and_raise_syserr(int syserr_errno, const char *format_string, ...) { @@ -70,17 +179,40 @@ void grab_gvl_and_raise_syserr(int syserr_errno, const char *format_string, ...) args.syserr_errno = syserr_errno; - va_list format_string_arguments; - va_start(format_string_arguments, format_string); - vsnprintf(args.exception_message, MAX_RAISE_MESSAGE_SIZE, format_string, format_string_arguments); + FORMAT_VA_ERROR_MESSAGE(formatted_exception_message, format_string); + snprintf(args.exception_message, MAX_RAISE_MESSAGE_SIZE, "%s", formatted_exception_message); if (is_current_thread_holding_the_gvl()) { - rb_raise( - rb_eRuntimeError, + // Errno exceptions have unique classes for each errno value. + // Because we are raising a RuntimeError instead, in order to preserve + // the same information as the original errno exception, we include + // the errno value in the message and telemetry message. + + // Add errno and static format string to the telemetry message. + char telemetry_message[MAX_RAISE_MESSAGE_SIZE]; + snprintf( + telemetry_message, + MAX_RAISE_MESSAGE_SIZE, + "grab_gvl_and_raise_syserr called by thread holding the global VM lock. syserr_errno: %d, exception_message: '%s'", + syserr_errno, + format_string + ); + + // Render the full exception message. + char exception_message[MAX_RAISE_MESSAGE_SIZE]; + snprintf( + exception_message, + MAX_RAISE_MESSAGE_SIZE, "grab_gvl_and_raise_syserr called by thread holding the global VM lock. syserr_errno: %d, exception_message: '%s'", syserr_errno, args.exception_message ); + + private_raise_native_error( + eNativeRuntimeError, + exception_message, + telemetry_message + ); } rb_thread_call_with_gvl(trigger_syserr_raise, &args); diff --git a/ext/datadog_profiling_native_extension/ruby_helpers.h b/ext/datadog_profiling_native_extension/ruby_helpers.h index 5c135a21d14..d17b716114e 100644 --- a/ext/datadog_profiling_native_extension/ruby_helpers.h +++ b/ext/datadog_profiling_native_extension/ruby_helpers.h @@ -1,11 +1,12 @@ #pragma once #include +#include #include "datadog_ruby_common.h" // Initialize internal data needed by some ruby helpers. Should be called during start, before any actual // usage of ruby helpers. -void ruby_helpers_init(void); +void ruby_helpers_init(VALUE profiling_module); // Processes any pending interruptions, including exceptions to be raised. // If there's an exception to be raised, it raises it. In that case, this function does not return. @@ -39,15 +40,55 @@ static inline int check_if_pending_exception(void) { #define VALUE_COUNT(array) (sizeof(array) / sizeof(VALUE)) +// Global references to Datadog::Profiling exception classes. +extern VALUE eNativeRuntimeError; +extern VALUE eNativeArgumentError; +extern VALUE eNativeTypeError; +// Raises an exception of the specified class with the formatted string as its message. +// This macro ensures that the literay string is sent for telemetry, while the formatted +// message is the default `Exception#message`. +// *Ruby exceptions not raised through this function will not be reported via telemetry.* +// Only the following error classes are supported, as they require an extra field for +// the telemetry-safe string: NativeRuntimeError, NativeArgumentError, NativeTypeError. + +#define raise_error(native_exception_class, fmt, ...) \ + private_raise_error(native_exception_class, "" fmt, ##__VA_ARGS__) + +#define grab_gvl_and_raise(native_exception_class, fmt, ...) \ + private_grab_gvl_and_raise(native_exception_class, "" fmt, ##__VA_ARGS__) + +// The message must be statically bound and checked. NORETURN( - void grab_gvl_and_raise(VALUE exception_class, const char *format_string, ...) + void raise_telemetry_safe_error(VALUE native_exception_class, const char *telemetry_safe_format, ...) __attribute__ ((format (printf, 2, 3))); ); + +// The message must be statically bound and checked. +NORETURN( + void raise_telemetry_safe_syserr(int syserr_errno, const char *telemetry_safe_format, ...) + __attribute__ ((format (printf, 2, 3))); +); + NORETURN( void grab_gvl_and_raise_syserr(int syserr_errno, const char *format_string, ...) __attribute__ ((format (printf, 2, 3))); ); +NORETURN( + void private_raise_error(VALUE native_exception_class, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +); + +NORETURN( + void private_grab_gvl_and_raise(VALUE native_exception_class, const char *format_string, ...) + __attribute__ ((format (printf, 2, 3))); +); + +// NOTE: Only used externally for testing, by `_native_raise_native_error_with_invalid_class` +NORETURN( + void private_raise_native_error(VALUE native_exception_class, const char *detailed_message, const char *static_message) +); + #define ENFORCE_SUCCESS_GVL(expression) ENFORCE_SUCCESS_HELPER(expression, true) #define ENFORCE_SUCCESS_NO_GVL(expression) ENFORCE_SUCCESS_HELPER(expression, false) @@ -67,6 +108,25 @@ NORETURN(void raise_syserr( const char *function_name )); +typedef enum { + SIGNAL_HANDLER_NAME_NULL, // Reserved to avoid accidental 0-value usage + SIGNAL_HANDLER_NAME_handle_sampling_signal, + SIGNAL_HANDLER_NAME_testing_signal_handler, + SIGNAL_HANDLER_NAME_holding_the_gvl_signal_handler, + SIGNAL_HANDLER_NAME_empty_signal_handler, + SIGNAL_HANDLER_NAME_COUNT +} signal_handler_name_t; + +static const char *const SIGNAL_HANDLER_NAMES[SIGNAL_HANDLER_NAME_COUNT] = { + [SIGNAL_HANDLER_NAME_NULL] = "", // Unused + [SIGNAL_HANDLER_NAME_handle_sampling_signal] = "handle_sampling_signal", + [SIGNAL_HANDLER_NAME_testing_signal_handler] = "testing_signal_handler", + [SIGNAL_HANDLER_NAME_holding_the_gvl_signal_handler] = "holding_the_gvl_signal_handler", + [SIGNAL_HANDLER_NAME_empty_signal_handler] = "empty_signal_handler" +}; + +#define SIGNAL_HANDLER_NAME(var) SIGNAL_HANDLER_NAME_##var + // Native wrapper to get an object ref from an id. Returns true on success and // writes the ref to the value pointer parameter if !NULL. False if id doesn't // reference a valid object (in which case value is not changed). diff --git a/ext/datadog_profiling_native_extension/setup_signal_handler.c b/ext/datadog_profiling_native_extension/setup_signal_handler.c index fa4bb27e95c..0477337d2db 100644 --- a/ext/datadog_profiling_native_extension/setup_signal_handler.c +++ b/ext/datadog_profiling_native_extension/setup_signal_handler.c @@ -11,23 +11,23 @@ static void install_sigprof_signal_handler_internal( void (*signal_handler_function)(int, siginfo_t *, void *), - const char *handler_pretty_name, + signal_handler_name_t handler_pretty_name, void (*signal_handler_to_replace)(int, siginfo_t *, void *) ); void empty_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) { } -void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *), const char *handler_pretty_name) { - install_sigprof_signal_handler_internal(signal_handler_function, handler_pretty_name, NULL); +void private_install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *), signal_handler_name_t handler_name) { + install_sigprof_signal_handler_internal(signal_handler_function, handler_name, NULL); } void replace_sigprof_signal_handler_with_empty_handler(void (*expected_existing_handler)(int, siginfo_t *, void *)) { - install_sigprof_signal_handler_internal(empty_signal_handler, "empty_signal_handler", expected_existing_handler); + install_sigprof_signal_handler_internal(empty_signal_handler, SIGNAL_HANDLER_NAME(empty_signal_handler), expected_existing_handler); } static void install_sigprof_signal_handler_internal( void (*signal_handler_function)(int, siginfo_t *, void *), - const char *handler_pretty_name, + signal_handler_name_t handler_name, void (*signal_handler_to_replace)(int, siginfo_t *, void *) ) { struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL}; @@ -37,8 +37,17 @@ static void install_sigprof_signal_handler_internal( }; sigemptyset(&signal_handler_config.sa_mask); + // Get statically bound handler name, ensuring it's safe for telemetry. + const char *handler_pretty_name; + if (handler_name > SIGNAL_HANDLER_NAME_NULL && handler_name < SIGNAL_HANDLER_NAME_COUNT) { + handler_pretty_name = SIGNAL_HANDLER_NAMES[handler_name]; + } else { + // Add the new handler to SIGNAL_HANDLER_NAMES if you see this + handler_pretty_name = "(UNNAMED)"; + } + if (sigaction(SIGPROF, &signal_handler_config, &existing_signal_handler_config) != 0) { - rb_exc_raise(rb_syserr_new_str(errno, rb_sprintf("Could not install profiling signal handler (%s)", handler_pretty_name))); + raise_telemetry_safe_syserr(errno, "Could not install profiling signal handler (%s)", handler_pretty_name); } // Because signal handler functions are global, let's check if we're not stepping on someone else's toes. @@ -56,23 +65,19 @@ static void install_sigprof_signal_handler_internal( // of the installation. if (sigaction(SIGPROF, &existing_signal_handler_config, NULL) != 0) { - rb_exc_raise( - rb_syserr_new_str( - errno, - rb_sprintf( - "Failed to install profiling signal handler (%s): " \ - "While installing a SIGPROF signal handler, the profiler detected that another software/library/gem had " \ - "previously installed a different SIGPROF signal handler. " \ - "The profiler tried to restore the previous SIGPROF signal handler, but this failed. " \ - "The other software/library/gem may have been left in a broken state. ", - handler_pretty_name - ) - ) + raise_telemetry_safe_syserr( + errno, + "Failed to install profiling signal handler (%s): " \ + "While installing a SIGPROF signal handler, the profiler detected that another software/library/gem had " \ + "previously installed a different SIGPROF signal handler. " \ + "The profiler tried to restore the previous SIGPROF signal handler, but this failed. " \ + "The other software/library/gem may have been left in a broken state. ", + handler_pretty_name ); } - rb_raise( - rb_eRuntimeError, + raise_telemetry_safe_error( + eNativeRuntimeError, "Could not install profiling signal handler (%s): There's a pre-existing SIGPROF signal handler", handler_pretty_name ); @@ -96,7 +101,13 @@ static inline void toggle_sigprof_signal_handler_for_current_thread(int action) sigemptyset(&signals_to_toggle); sigaddset(&signals_to_toggle, SIGPROF); int error = pthread_sigmask(action, &signals_to_toggle, NULL); - if (error) rb_exc_raise(rb_syserr_new_str(error, rb_sprintf("Unexpected failure in pthread_sigmask, action=%d", action))); + + if (error) { + // Ensure we know statically the possible values for `action`. + int telemetry_safe_action = (action >= SIG_BLOCK && action <= SIG_UNBLOCK) ? action : -1; + + raise_telemetry_safe_syserr(error, "Unexpected failure in pthread_sigmask, action=%d", telemetry_safe_action); + } } void block_sigprof_signal_handler_from_running_in_current_thread(void) { diff --git a/ext/datadog_profiling_native_extension/setup_signal_handler.h b/ext/datadog_profiling_native_extension/setup_signal_handler.h index 3c969986c4d..c35670fcbe2 100644 --- a/ext/datadog_profiling_native_extension/setup_signal_handler.h +++ b/ext/datadog_profiling_native_extension/setup_signal_handler.h @@ -2,9 +2,12 @@ #include #include "datadog_ruby_common.h" +#include "ruby_helpers.h" void empty_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext); -void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *), const char *handler_pretty_name); +#define install_sigprof_signal_handler(signal_handler_function) \ + private_install_sigprof_signal_handler(signal_handler_function, SIGNAL_HANDLER_NAME(signal_handler_function)) +void private_install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *), signal_handler_name_t handler_pretty_name); void replace_sigprof_signal_handler_with_empty_handler(void (*expected_existing_handler)(int, siginfo_t *, void *)); void remove_sigprof_signal_handler(void); void block_sigprof_signal_handler_from_running_in_current_thread(void); diff --git a/ext/datadog_profiling_native_extension/stack_recorder.c b/ext/datadog_profiling_native_extension/stack_recorder.c index bc3b3439500..a7248863e14 100644 --- a/ext/datadog_profiling_native_extension/stack_recorder.c +++ b/ext/datadog_profiling_native_extension/stack_recorder.c @@ -349,7 +349,7 @@ static VALUE _native_new(VALUE klass) { ddog_prof_ManagedStringStorageNewResult string_storage = ddog_prof_ManagedStringStorage_new(); if (string_storage.tag == DDOG_PROF_MANAGED_STRING_STORAGE_NEW_RESULT_ERR) { - rb_raise(rb_eRuntimeError, "Failed to initialize string storage: %"PRIsVALUE, get_error_details_and_drop(&string_storage.err)); + raise_error(eNativeRuntimeError, "Failed to initialize string storage: %"PRIsVALUE, get_error_details_and_drop(&string_storage.err)); } state->string_storage = string_storage.ok; @@ -383,7 +383,7 @@ static void initialize_profiles(stack_recorder_state *state, ddog_prof_Slice_Val ddog_prof_Profile_with_string_storage(sample_types, NULL /* period is optional */, state->string_storage); if (slot_one_profile_result.tag == DDOG_PROF_PROFILE_NEW_RESULT_ERR) { - rb_raise(rb_eRuntimeError, "Failed to initialize slot one profile: %"PRIsVALUE, get_error_details_and_drop(&slot_one_profile_result.err)); + raise_error(eNativeRuntimeError, "Failed to initialize slot one profile: %"PRIsVALUE, get_error_details_and_drop(&slot_one_profile_result.err)); } state->profile_slot_one = (profile_slot) { .profile = slot_one_profile_result.ok, .start_timestamp = start_timestamp }; @@ -393,7 +393,7 @@ static void initialize_profiles(stack_recorder_state *state, ddog_prof_Slice_Val if (slot_two_profile_result.tag == DDOG_PROF_PROFILE_NEW_RESULT_ERR) { // Note: No need to take any special care of slot one, it'll get cleaned up by stack_recorder_typed_data_free - rb_raise(rb_eRuntimeError, "Failed to initialize slot two profile: %"PRIsVALUE, get_error_details_and_drop(&slot_two_profile_result.err)); + raise_error(eNativeRuntimeError, "Failed to initialize slot two profile: %"PRIsVALUE, get_error_details_and_drop(&slot_two_profile_result.err)); } state->profile_slot_two = (profile_slot) { .profile = slot_two_profile_result.ok, .start_timestamp = start_timestamp }; @@ -591,7 +591,7 @@ static VALUE _native_serialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_instan ddog_prof_MaybeError result = args.advance_gen_result; if (result.tag == DDOG_PROF_OPTION_ERROR_SOME_ERROR) { - rb_raise(rb_eRuntimeError, "Failed to advance string storage gen: %"PRIsVALUE, get_error_details_and_drop(&result.some)); + raise_error(eNativeRuntimeError, "Failed to advance string storage gen: %"PRIsVALUE, get_error_details_and_drop(&result.some)); } VALUE start = ruby_time_from(args.slot->start_timestamp); @@ -658,7 +658,7 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sampler_unlock_active_profile(active_slot); if (result.tag == DDOG_PROF_PROFILE_RESULT_ERR) { - rb_raise(rb_eArgError, "Failed to record sample: %"PRIsVALUE, get_error_details_and_drop(&result.err)); + raise_error(eNativeArgumentError, "Failed to record sample: %"PRIsVALUE, get_error_details_and_drop(&result.err)); } } @@ -682,7 +682,7 @@ void record_endpoint(VALUE recorder_instance, uint64_t local_root_span_id, ddog_ sampler_unlock_active_profile(active_slot); if (result.tag == DDOG_PROF_PROFILE_RESULT_ERR) { - rb_raise(rb_eArgError, "Failed to record endpoint: %"PRIsVALUE, get_error_details_and_drop(&result.err)); + raise_error(eNativeArgumentError, "Failed to record endpoint: %"PRIsVALUE, get_error_details_and_drop(&result.err)); } } @@ -770,10 +770,10 @@ static void build_heap_profile_without_gvl(stack_recorder_state *state, profile_ // same locks are acquired by the heap recorder while holding the gvl (since we'd be operating on the // same locks but acquiring them in different order). if (!iterated) { - grab_gvl_and_raise(rb_eRuntimeError, "Failure during heap profile building: iteration cancelled"); + grab_gvl_and_raise(eNativeRuntimeError, "Failure during heap profile building: iteration cancelled"); } else if (iteration_context.error) { - grab_gvl_and_raise(rb_eRuntimeError, "Failure during heap profile building: %s", iteration_context.error_msg); + grab_gvl_and_raise(eNativeRuntimeError, "Failure during heap profile building: %s", iteration_context.error_msg); } } @@ -824,7 +824,7 @@ static locked_profile_slot sampler_lock_active_profile(stack_recorder_state *sta } // We already tried both multiple times, and we did not succeed. This is not expected to happen. Let's stop sampling. - rb_raise(rb_eRuntimeError, "Failed to grab either mutex in sampler_lock_active_profile"); + raise_error(eNativeRuntimeError, "Failed to grab either mutex in sampler_lock_active_profile"); } static void sampler_unlock_active_profile(locked_profile_slot active_slot) { @@ -835,7 +835,7 @@ static profile_slot* serializer_flip_active_and_inactive_slots(stack_recorder_st int previously_active_slot = state->active_slot; if (previously_active_slot != 1 && previously_active_slot != 2) { - grab_gvl_and_raise(rb_eRuntimeError, "Unexpected active_slot state %d in serializer_flip_active_and_inactive_slots", previously_active_slot); + grab_gvl_and_raise(eNativeRuntimeError, "Unexpected active_slot state %d in serializer_flip_active_and_inactive_slots", previously_active_slot); } pthread_mutex_t *previously_active = (previously_active_slot == 1) ? &state->mutex_slot_one : &state->mutex_slot_two; @@ -889,7 +889,7 @@ static VALUE test_slot_mutex_state(VALUE recorder_instance, int slot) { return Qtrue; } else { ENFORCE_SUCCESS_GVL(error); - rb_raise(rb_eRuntimeError, "Failed to raise exception in test_slot_mutex_state; this should never happen"); + raise_error(eNativeRuntimeError, "Failed to raise exception in test_slot_mutex_state; this should never happen"); } } @@ -941,7 +941,7 @@ static VALUE _native_track_object(DDTRACE_UNUSED VALUE _self, VALUE recorder_ins static void reset_profile_slot(profile_slot *slot, ddog_Timespec start_timestamp) { ddog_prof_Profile_Result reset_result = ddog_prof_Profile_reset(&slot->profile); if (reset_result.tag == DDOG_PROF_PROFILE_RESULT_ERR) { - rb_raise(rb_eRuntimeError, "Failed to reset profile: %"PRIsVALUE, get_error_details_and_drop(&reset_result.err)); + raise_error(eNativeRuntimeError, "Failed to reset profile: %"PRIsVALUE, get_error_details_and_drop(&reset_result.err)); } slot->start_timestamp = start_timestamp; slot->stats = (stats_slot) {}; @@ -1056,14 +1056,14 @@ static VALUE _native_test_managed_string_storage_produces_valid_profiles(DDTRACE ddog_prof_ManagedStringStorageNewResult string_storage = ddog_prof_ManagedStringStorage_new(); if (string_storage.tag == DDOG_PROF_MANAGED_STRING_STORAGE_NEW_RESULT_ERR) { - rb_raise(rb_eRuntimeError, "Failed to initialize string storage: %"PRIsVALUE, get_error_details_and_drop(&string_storage.err)); + raise_error(eNativeRuntimeError, "Failed to initialize string storage: %"PRIsVALUE, get_error_details_and_drop(&string_storage.err)); } ddog_prof_Slice_ValueType sample_types = {.ptr = all_value_types, .len = ALL_VALUE_TYPES_COUNT}; ddog_prof_Profile_NewResult profile = ddog_prof_Profile_with_string_storage(sample_types, NULL, string_storage.ok); if (profile.tag == DDOG_PROF_PROFILE_NEW_RESULT_ERR) { - rb_raise(rb_eRuntimeError, "Failed to initialize profile: %"PRIsVALUE, get_error_details_and_drop(&profile.err)); + raise_error(eNativeRuntimeError, "Failed to initialize profile: %"PRIsVALUE, get_error_details_and_drop(&profile.err)); } ddog_prof_ManagedStringId hello = intern_or_raise(string_storage.ok, DDOG_CHARSLICE_C("hello")); @@ -1097,7 +1097,7 @@ static VALUE _native_test_managed_string_storage_produces_valid_profiles(DDTRACE ); if (result.tag == DDOG_PROF_PROFILE_RESULT_ERR) { - rb_raise(rb_eArgError, "Failed to record sample: %"PRIsVALUE, get_error_details_and_drop(&result.err)); + raise_error(eNativeArgumentError, "Failed to record sample: %"PRIsVALUE, get_error_details_and_drop(&result.err)); } ddog_Timespec finish_timestamp = system_epoch_now_timespec(); @@ -1105,13 +1105,13 @@ static VALUE _native_test_managed_string_storage_produces_valid_profiles(DDTRACE ddog_prof_Profile_SerializeResult serialize_result = ddog_prof_Profile_serialize(&profile.ok, &start_timestamp, &finish_timestamp); if (serialize_result.tag == DDOG_PROF_PROFILE_SERIALIZE_RESULT_ERR) { - rb_raise(rb_eRuntimeError, "Failed to serialize: %"PRIsVALUE, get_error_details_and_drop(&serialize_result.err)); + raise_error(eNativeRuntimeError, "Failed to serialize: %"PRIsVALUE, get_error_details_and_drop(&serialize_result.err)); } ddog_prof_MaybeError advance_gen_result = ddog_prof_ManagedStringStorage_advance_gen(string_storage.ok); if (advance_gen_result.tag == DDOG_PROF_OPTION_ERROR_SOME_ERROR) { - rb_raise(rb_eRuntimeError, "Failed to advance string storage gen: %"PRIsVALUE, get_error_details_and_drop(&advance_gen_result.some)); + raise_error(eNativeRuntimeError, "Failed to advance string storage gen: %"PRIsVALUE, get_error_details_and_drop(&advance_gen_result.some)); } VALUE encoded_pprof_1 = from_ddog_prof_EncodedProfile(serialize_result.ok); @@ -1127,13 +1127,13 @@ static VALUE _native_test_managed_string_storage_produces_valid_profiles(DDTRACE ); if (result.tag == DDOG_PROF_PROFILE_RESULT_ERR) { - rb_raise(rb_eArgError, "Failed to record sample: %"PRIsVALUE, get_error_details_and_drop(&result.err)); + raise_error(eNativeArgumentError, "Failed to record sample: %"PRIsVALUE, get_error_details_and_drop(&result.err)); } serialize_result = ddog_prof_Profile_serialize(&profile.ok, &start_timestamp, &finish_timestamp); if (serialize_result.tag == DDOG_PROF_PROFILE_SERIALIZE_RESULT_ERR) { - rb_raise(rb_eArgError, "Failed to serialize: %"PRIsVALUE, get_error_details_and_drop(&serialize_result.err)); + raise_error(eNativeArgumentError, "Failed to serialize: %"PRIsVALUE, get_error_details_and_drop(&serialize_result.err)); } VALUE encoded_pprof_2 = from_ddog_prof_EncodedProfile(serialize_result.ok); diff --git a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c index c3c23b7f1c7..bb407a68978 100644 --- a/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c +++ b/ext/datadog_profiling_native_extension/unsafe_api_calls_check.c @@ -3,6 +3,7 @@ #include #include "datadog_ruby_common.h" +#include "ruby_helpers.h" #include "unsafe_api_calls_check.h" #include "extconf.h" @@ -21,7 +22,7 @@ void unsafe_api_calls_check_init(void) { check_for_unsafe_api_calls_handle = rb_postponed_job_preregister(unused_flags, check_for_unsafe_api_calls, NULL); if (check_for_unsafe_api_calls_handle == POSTPONED_JOB_HANDLE_INVALID) { - rb_raise(rb_eRuntimeError, "Failed to register check_for_unsafe_api_calls_handle postponed job (got POSTPONED_JOB_HANDLE_INVALID)"); + raise_error(eNativeRuntimeError, "Failed to register check_for_unsafe_api_calls_handle postponed job (got POSTPONED_JOB_HANDLE_INVALID)"); } #endif } diff --git a/ext/libdatadog_api/datadog_ruby_common.c b/ext/libdatadog_api/datadog_ruby_common.c index ca402dca264..461fd245a7f 100644 --- a/ext/libdatadog_api/datadog_ruby_common.c +++ b/ext/libdatadog_api/datadog_ruby_common.c @@ -1,4 +1,5 @@ #include "datadog_ruby_common.h" +#include // IMPORTANT: Currently this file is copy-pasted between extensions. Make sure to update all versions when doing any change! @@ -18,6 +19,14 @@ void raise_unexpected_type(VALUE value, const char *value_name, const char *type ); } +void raise_error(VALUE error_class, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + VALUE message = rb_vsprintf(fmt, args); + va_end(args); + rb_raise(error_class, "%"PRIsVALUE, message); +} + VALUE datadog_gem_version(void) { VALUE ddtrace_module = rb_const_get(rb_cObject, rb_intern("Datadog")); ENFORCE_TYPE(ddtrace_module, T_MODULE); diff --git a/ext/libdatadog_api/datadog_ruby_common.h b/ext/libdatadog_api/datadog_ruby_common.h index d55c2bb479f..adadee49fb8 100644 --- a/ext/libdatadog_api/datadog_ruby_common.h +++ b/ext/libdatadog_api/datadog_ruby_common.h @@ -32,6 +32,9 @@ NORETURN(void raise_unexpected_type(VALUE value, const char *value_name, const char *type_name, const char *file, int line, const char* function_name)); +// Helper to raise errors with formatted messages +NORETURN(void raise_error(VALUE error_class, const char *fmt, ...)); + // Helper to retrieve Datadog::VERSION::STRING VALUE datadog_gem_version(void); diff --git a/gemfiles/ruby_3.4_opentelemetry.gemfile.lock b/gemfiles/ruby_3.4_opentelemetry.gemfile.lock index ec9e4bcaa5f..1171237dc70 100644 --- a/gemfiles/ruby_3.4_opentelemetry.gemfile.lock +++ b/gemfiles/ruby_3.4_opentelemetry.gemfile.lock @@ -34,6 +34,7 @@ GEM docile (1.4.1) dogstatsd-ruby (5.6.1) ffi (1.17.2-aarch64-linux-gnu) + ffi (1.17.2-arm64-darwin) ffi (1.17.2-x86_64-linux-gnu) google-protobuf (3.25.3) hashdiff (1.1.0) @@ -48,6 +49,8 @@ GEM libdatadog (24.0.1.1.0-x86_64-linux) libddwaf (1.30.0.0.0-aarch64-linux) ffi (~> 1.0) + libddwaf (1.30.0.0.0-arm64-darwin) + ffi (~> 1.0) libddwaf (1.30.0.0.0-x86_64-linux) ffi (~> 1.0) logger (1.7.0) @@ -125,6 +128,7 @@ GEM PLATFORMS aarch64-linux + arm64-darwin-23 x86_64-linux DEPENDENCIES diff --git a/gemfiles/ruby_3.4_opentelemetry_otlp.gemfile.lock b/gemfiles/ruby_3.4_opentelemetry_otlp.gemfile.lock index 0a7645ad0ba..566945448a0 100644 --- a/gemfiles/ruby_3.4_opentelemetry_otlp.gemfile.lock +++ b/gemfiles/ruby_3.4_opentelemetry_otlp.gemfile.lock @@ -34,6 +34,7 @@ GEM docile (1.4.1) dogstatsd-ruby (5.6.2) ffi (1.17.2-aarch64-linux-gnu) + ffi (1.17.2-arm64-darwin) ffi (1.17.2-x86_64-linux-gnu) google-protobuf (3.25.5) googleapis-common-protos-types (1.16.0) @@ -46,10 +47,13 @@ GEM reline (>= 0.4.2) json-schema (2.8.1) addressable (>= 2.4) + libdatadog (24.0.1.1.0) libdatadog (24.0.1.1.0-aarch64-linux) libdatadog (24.0.1.1.0-x86_64-linux) libddwaf (1.30.0.0.0-aarch64-linux) ffi (~> 1.0) + libddwaf (1.30.0.0.0-arm64-darwin) + ffi (~> 1.0) libddwaf (1.30.0.0.0-x86_64-linux) ffi (~> 1.0) logger (1.7.0) @@ -132,6 +136,7 @@ GEM PLATFORMS aarch64-linux + arm64-darwin-23 x86_64-linux DEPENDENCIES diff --git a/gemfiles/ruby_3.4_relational_db.gemfile.lock b/gemfiles/ruby_3.4_relational_db.gemfile.lock index f446ab977ba..7cc95f4523e 100644 --- a/gemfiles/ruby_3.4_relational_db.gemfile.lock +++ b/gemfiles/ruby_3.4_relational_db.gemfile.lock @@ -49,6 +49,7 @@ GEM docile (1.4.1) dogstatsd-ruby (5.6.3) ffi (1.17.2-aarch64-linux-gnu) + ffi (1.17.2-arm64-darwin) ffi (1.17.2-x86_64-linux-gnu) google-protobuf (3.25.5) hashdiff (1.1.2) @@ -65,6 +66,8 @@ GEM libdatadog (24.0.1.1.0-x86_64-linux) libddwaf (1.30.0.0.0-aarch64-linux) ffi (~> 1.0) + libddwaf (1.30.0.0.0-arm64-darwin) + ffi (~> 1.0) libddwaf (1.30.0.0.0-x86_64-linux) ffi (~> 1.0) logger (1.7.0) @@ -141,6 +144,7 @@ GEM PLATFORMS aarch64-linux + arm64-darwin-23 x86_64-linux DEPENDENCIES @@ -183,4 +187,4 @@ DEPENDENCIES zstd-ruby BUNDLED WITH - 2.7.2 + 2.7.1 diff --git a/lib/datadog/core/telemetry/logging.rb b/lib/datadog/core/telemetry/logging.rb index 036bb78d8f4..ce5f431c46c 100644 --- a/lib/datadog/core/telemetry/logging.rb +++ b/lib/datadog/core/telemetry/logging.rb @@ -49,7 +49,13 @@ def report(exception, level: :error, description: nil) # Anonymous exceptions to be logged as message = +"#{exception.class.name || exception.class.inspect}" # standard:disable Style/RedundantInterpolation - message << ": #{description}" if description + telemetry_message = message_for_telemety(exception) + + if description || telemetry_message + message << ':' + message << " #{description}" if description + message << " (#{telemetry_message})" if telemetry_message + end event = Event::Log.new( message: message, @@ -65,6 +71,20 @@ def error(description) log!(event) end + + private + + # A constant string representing the exception + def message_for_telemety(exception) + if exception.respond_to?(:telemetry_message) + exception.telemetry_message + # For the 150+ `Errno` error classes, we set the telemetry message as an instance variable. + # We don't want to create/modify such a large number of classes for a small feature. + # Note: Errno classes are subclasses of SystemCallError. + elsif exception.is_a?(SystemCallError) && exception.instance_variable_defined?(:@telemetry_message) + exception.instance_variable_get(:@telemetry_message) + end + end end end end diff --git a/lib/datadog/profiling.rb b/lib/datadog/profiling.rb index 7f751528589..b5f56628d8c 100644 --- a/lib/datadog/profiling.rb +++ b/lib/datadog/profiling.rb @@ -75,6 +75,25 @@ def self.wait_until_running(timeout_seconds: 5) raise 'Profiler not enabled or available' end end + # Base error type for exceptions raised by our native extensions. + # These errors have both the original error message and a telemetry-safe message. + # The telemetry-safe message is statically defined and does not possess dynamic data. + module NativeError + attr_reader :telemetry_message + def initialize(message, telemetry_message = nil) + super(message) + @telemetry_message = telemetry_message + end + end + class NativeRuntimeError < RuntimeError + prepend NativeError + end + class NativeArgumentError < ArgumentError + prepend NativeError + end + class NativeTypeError < TypeError + prepend NativeError + end private_class_method def self.replace_noop_allocation_count class << self diff --git a/lib/datadog/profiling/stack_recorder.rb b/lib/datadog/profiling/stack_recorder.rb index 6c7d7b7b81c..005ff2a035c 100644 --- a/lib/datadog/profiling/stack_recorder.rb +++ b/lib/datadog/profiling/stack_recorder.rb @@ -88,7 +88,7 @@ def serialize! else error_message = result - raise("Failed to serialize profiling data: #{error_message}") + raise "Failed to serialize profiling data: #{error_message}" end end diff --git a/sig/datadog/core/telemetry/logging.rbs b/sig/datadog/core/telemetry/logging.rbs index f64aa023f3c..0e155ebecf9 100644 --- a/sig/datadog/core/telemetry/logging.rbs +++ b/sig/datadog/core/telemetry/logging.rbs @@ -15,6 +15,8 @@ module Datadog def report: (Exception exception, ?level: Symbol, ?description: String?) -> void def error: (String description) -> void + private + def constant_exception_message: (Exception exception) -> String? end end end diff --git a/sig/datadog/profiling.rbs b/sig/datadog/profiling.rbs index 88a8289d082..29037370978 100644 --- a/sig/datadog/profiling.rbs +++ b/sig/datadog/profiling.rbs @@ -1,5 +1,11 @@ module Datadog module Profiling + class NativeError < RuntimeError + attr_reader telemetry_message: String? + def initialize: (*untyped args, **untyped kwargs) -> void + end + ProfilingError: singleton(NativeError) + ProfilingInternalError: singleton(NativeError) def self.supported?: () -> bool def self.unsupported_reason: () -> ::String? def self.start_if_enabled: () -> bool diff --git a/spec/datadog/core/telemetry/logging_spec.rb b/spec/datadog/core/telemetry/logging_spec.rb index 363bc1b380f..d3c5f745c2a 100644 --- a/spec/datadog/core/telemetry/logging_spec.rb +++ b/spec/datadog/core/telemetry/logging_spec.rb @@ -82,6 +82,126 @@ def log!(_event) end end end + context 'with an Erro error carrying telemetry message' do + subject(:report) do + raise Errno::ENOENT, 'Dynamic runtime message' + rescue SystemCallError => error + # This is normally done by native extensions, when raising Errno errors + error.instance_variable_set(:@telemetry_message, 'Safe message') + + component.report(error, level: :error, description: description) + end + + let(:description) { nil } + + it 'includes the telemetry message but not the dynamic exception message' do + expect(component).to receive(:log!).with(instance_of(Datadog::Core::Telemetry::Event::Log)) do |event| + expect(event.payload).to include( + logs: [{message: 'Errno::ENOENT: (Safe message)', level: 'ERROR', count: 1, + stack_trace: a_string_including('REDACTED')}] + ) + expect(event.payload[:logs].first[:message]).not_to include('Dynamic runtime message') + end + + report + end + + context 'with description' do + let(:description) { 'Operation failed' } + + it 'includes both description and telemetry message' do + expect(component).to receive(:log!).with(instance_of(Datadog::Core::Telemetry::Event::Log)) do |event| + expect(event.payload).to include( + logs: [{message: 'Errno::ENOENT: Operation failed (Safe message)', level: 'ERROR', count: 1, + stack_trace: a_string_including('REDACTED')}] + ) + end + + report + end + end + end + context 'with NativeError' do + before do + skip unless defined?(Datadog::Profiling::NativeError) + end + it 'includes the telemetry-safe message in telemetry' do + expect(component).to receive(:log!).with(instance_of(Datadog::Core::Telemetry::Event::Log)) do |event| + expect(event.payload).to include( + logs: [{message: 'Datadog::Profiling::NativeRuntimeError: (This is a safe profiler error)', level: 'ERROR', count: 1, + stack_trace: a_string_including('REDACTED')}] + ) + end + begin + raise Datadog::Profiling::NativeRuntimeError.new('This is a safe profiler error', telemetry_message: 'This is a safe profiler error') + rescue => e + component.report(e, level: :error) + end + end + context 'with description' do + it 'includes both description and telemetry message' do + expect(component).to receive(:log!).with(instance_of(Datadog::Core::Telemetry::Event::Log)) do |event| + expect(event.payload).to include( + logs: [{message: 'Datadog::Profiling::NativeRuntimeError: Profiler failed to start (Failed to initialize native extension)', level: 'ERROR', count: 1, + stack_trace: a_string_including('REDACTED')}] + ) + end + begin + raise Datadog::Profiling::NativeRuntimeError.new('Failed to initialize native extension', telemetry_message: 'Failed to initialize native extension') + rescue => e + component.report(e, level: :error, description: 'Profiler failed to start') + end + end + end + context 'without telemetry message' do + it 'omits the dynamic exception message from telemetry' do + expect(component).to receive(:log!).with(instance_of(Datadog::Core::Telemetry::Event::Log)) do |event| + expect(event.payload).to include( + logs: [{message: 'Datadog::Profiling::NativeRuntimeError', level: 'ERROR', count: 1, + stack_trace: a_string_including('REDACTED')}] + ) + expect(event.payload[:logs].map { |log| log[:message] }).not_to include(/Failed to initialize.*0x[0-9a-f]+/) + end + begin + raise Datadog::Profiling::NativeRuntimeError, 'Failed to initialize string storage: Error at address 0xdeadbeef' + rescue => e + component.report(e, level: :error) + end + end + end + context 'with telemetry message and dynamic content' do + it 'includes only the telemetry-safe message' do + expect(component).to receive(:log!).with(instance_of(Datadog::Core::Telemetry::Event::Log)) do |event| + expect(event.payload).to include( + logs: [{message: 'Datadog::Profiling::NativeRuntimeError: (Static format string)', level: 'ERROR', count: 1, + stack_trace: a_string_including('REDACTED')}] + ) + expect(event.payload[:logs].map { |log| log[:message] }).not_to include('Dynamic info 0xabc123') + end + begin + raise Datadog::Profiling::NativeRuntimeError.new('Static format string', 'Dynamic info 0xabc123') + rescue => e + component.report(e, level: :error) + end + end + end + context 'with description and dynamic content' do + it 'includes the description but not the dynamic exception message' do + expect(component).to receive(:log!).with(instance_of(Datadog::Core::Telemetry::Event::Log)) do |event| + expect(event.payload).to include( + logs: [{message: 'Datadog::Profiling::NativeRuntimeError: libdatadog internal error', level: 'ERROR', count: 1, + stack_trace: a_string_including('REDACTED')}] + ) + expect(event.payload[:logs].map { |log| log[:message] }).not_to include(/memory address/) + end + begin + raise Datadog::Profiling::NativeRuntimeError, 'Failed to serialize profile: Invalid memory address 0x12345678' + rescue => e + component.report(e, level: :error, description: 'libdatadog internal error') + end + end + end + end end describe '.error' do diff --git a/spec/datadog/profiling/collectors/code_provenance_spec.rb b/spec/datadog/profiling/collectors/code_provenance_spec.rb index ce111a7de88..906f3b26d33 100644 --- a/spec/datadog/profiling/collectors/code_provenance_spec.rb +++ b/spec/datadog/profiling/collectors/code_provenance_spec.rb @@ -21,6 +21,8 @@ it "records libraries that are currently loaded" do refresh + platform_fragment = RUBY_PLATFORM + hyphenated_platform_fragment = platform_fragment.sub(/darwin(\d+)/, 'darwin-\1') expect(generate_result).to include( { kind: "standard library", @@ -34,7 +36,10 @@ version: Datadog::VERSION::STRING, paths: contain_exactly( start_with("/"), - include("extensions").and(include(RUBY_PLATFORM)), + satisfy do |path| + path.include?("extensions") && + (path.include?(platform_fragment) || path.include?(hyphenated_platform_fragment)) + end, "#{Gem.bindir}/ddprofrb", "#{Bundler.bin_path}/ddprofrb", ), @@ -50,6 +55,8 @@ it "includes the native extension directory for gems with native extensions" do refresh + platform_fragment = RUBY_PLATFORM + hyphenated_platform_fragment = platform_fragment.sub(/darwin(\d+)/, 'darwin-\1') expect(generate_result.find { |it| it[:name] == "msgpack" }).to include( { @@ -58,7 +65,10 @@ version: MessagePack::VERSION, paths: contain_exactly( satisfy { |it| it.start_with?(Gem.dir) && !it.include?("extensions") }, - include("extensions").and(include(RUBY_PLATFORM)), + satisfy do |path| + path.include?("extensions") && + (path.include?(platform_fragment) || path.include?(hyphenated_platform_fragment)) + end, ), } ) diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 408abf1ddd8..212b8943c4f 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -114,6 +114,7 @@ exception = try_wait_until(backoff: 0.01) { another_instance.send(:failure_exception) } expect(exception.message).to include "another instance" + expect(exception.telemetry_message).to eq("Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread") another_instance.stop end @@ -199,7 +200,9 @@ exception = try_wait_until(backoff: 0.01) { cpu_and_wall_time_worker.send(:failure_exception) } - expect(exception.message).to include "pre-existing SIGPROF" + expect(exception.message).to include "(empty_signal_handler): There's a pre-existing SIGPROF" + # The handler name and signal name are statically check as safe for telemetry + expect(exception.telemetry_message).to include "(empty_signal_handler): There's a pre-existing SIGPROF" end it "leaves the existing signal handler in place" do diff --git a/spec/datadog/profiling/collectors/stack_spec.rb b/spec/datadog/profiling/collectors/stack_spec.rb index 7a9ecf8997a..c499a370c73 100644 --- a/spec/datadog/profiling/collectors/stack_spec.rb +++ b/spec/datadog/profiling/collectors/stack_spec.rb @@ -242,7 +242,7 @@ def call_sleep expect(gathered_stack).to eq reference_stack end - context "when native filenames are enabled" do + context "when native filenames are enabled", if: PlatformHelpers.linux? do let(:native_filenames_enabled) { true } before do @@ -292,7 +292,7 @@ def save_rounding_mode # rubocop:disable Lint/UselessMethodDefinition expect(gathered_stack).to eq reference_stack end - context "when native filenames are enabled" do + context "when native filenames are enabled", if: PlatformHelpers.linux? do let(:native_filenames_enabled) { true } before do @@ -330,7 +330,10 @@ def save_rounding_mode # rubocop:disable Lint/UselessMethodDefinition it do expect { sample_and_decode(background_thread, :labels, is_gvl_waiting_state: true) - }.to raise_error(RuntimeError, /BUG: .* is_gvl_waiting/) + }.to raise_error(Datadog::Profiling::NativeRuntimeError) do |error| + expect(error.message).to match(/BUG: .* is_gvl_waiting/) + expect(error.telemetry_message).to eq('BUG: Unexpected combination of cpu-time with is_gvl_waiting') + end end end @@ -366,7 +369,10 @@ def save_rounding_mode # rubocop:disable Lint/UselessMethodDefinition let(:metric_values) { {"cpu-samples" => 1} } it "raises an exception" do - expect { gathered_stack }.to raise_error(RuntimeError, /BUG: Unexpected missing state_label/) + expect { gathered_stack }.to raise_error(Datadog::Profiling::NativeRuntimeError) do |error| + expect(error.message).to match(/BUG: Unexpected missing state_label/) + expect(error.telemetry_message).to eq('BUG: Unexpected missing state_label') + end end end @@ -825,6 +831,9 @@ def dummy_template.#{method_name}(ready_queue) end describe "_native_filenames_available?" do + before do + skip "Native filename helpers are not available on macOS" if PlatformHelpers.mac? + end context "on linux", if: PlatformHelpers.linux? do it "returns true" do expect(described_class._native_filenames_available?).to be true @@ -839,6 +848,9 @@ def dummy_template.#{method_name}(ready_queue) end describe "_native_ruby_native_filename" do + before do + skip "Native filename helpers are not available on macOS" if PlatformHelpers.mac? + end context "on linux", if: PlatformHelpers.linux? do it "returns the correct filename" do expect(described_class._native_ruby_native_filename).to end_with("/ruby").or(include("libruby.so")) diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index dedf7f9b90c..2c91298f7d3 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -1582,7 +1582,10 @@ def sample_and_check(expected_state:) context "when called before on_gc_start/on_gc_finish" do it do - expect { sample_after_gc(allow_exception: true) }.to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) + expect { sample_after_gc(allow_exception: true) }.to raise_error(Datadog::Profiling::NativeRuntimeError) do |error| + expect(error.message).to match(/Unexpected call to sample_after_gc/) + expect(error.telemetry_message).to eq('BUG: Unexpected call to sample_after_gc without valid GC information available') + end end end @@ -1601,7 +1604,10 @@ def sample_and_check(expected_state:) sample_after_gc expect { sample_after_gc(allow_exception: true) } - .to raise_error(RuntimeError, /Unexpected call to sample_after_gc/) + .to raise_error(Datadog::Profiling::NativeRuntimeError) do |error| + expect(error.message).to match(/Unexpected call to sample_after_gc/) + expect(error.telemetry_message).to eq('BUG: Unexpected call to sample_after_gc without valid GC information available') + end end end @@ -1968,6 +1974,18 @@ def sample_and_check(expected_state:) describe "#sample_after_gvl_running" do before { skip_if_gvl_profiling_not_supported(self) } + context "when timeline is disabled" do + let(:timeline_enabled) { false } + before do + skip "GVL profiling timeline crashes on macOS" if PlatformHelpers.mac? + end + it "raises a telemetry-instrumented error" do + expect { sample_after_gvl_running(t1) }.to raise_error(Datadog::Profiling::NativeRuntimeError) do |error| + expect(error.message).to include("GVL profiling requires timeline to be enabled") + expect(error.telemetry_message).to eq("GVL profiling requires timeline to be enabled") + end + end + end let(:timeline_enabled) { true } context "when thread is not at the end of a Waiting for GVL period" do diff --git a/spec/datadog/profiling/http_transport_spec.rb b/spec/datadog/profiling/http_transport_spec.rb index 8709c9759b0..054e71d9c7b 100644 --- a/spec/datadog/profiling/http_transport_spec.rb +++ b/spec/datadog/profiling/http_transport_spec.rb @@ -14,7 +14,10 @@ # We also have "integration" specs, where we exercise the Ruby code together with the C code and libdatadog to ensure # that things come out of libdatadog as we expected. RSpec.describe Datadog::Profiling::HttpTransport do - before { skip_if_profiling_not_supported(self) } + before do + skip "Profiling HTTP transport integration tests require Linux networking helpers" if PlatformHelpers.mac? + skip_if_profiling_not_supported(self) + end subject(:http_transport) do described_class.new( diff --git a/spec/datadog/profiling/libdatadog_extconf_helpers_spec.rb b/spec/datadog/profiling/libdatadog_extconf_helpers_spec.rb index 3ee75ff2758..bab12a18a92 100644 --- a/spec/datadog/profiling/libdatadog_extconf_helpers_spec.rb +++ b/spec/datadog/profiling/libdatadog_extconf_helpers_spec.rb @@ -11,7 +11,7 @@ before do skip_if_profiling_not_supported(self) if PlatformHelpers.mac? && Libdatadog.pkgconfig_folder.nil? && ENV["LIBDATADOG_VENDOR_OVERRIDE"].nil? - raise "You have a libdatadog setup without macOS support. Did you forget to set LIBDATADOG_VENDOR_OVERRIDE?" + skip "Needs LIBDATADOG_VENDOR_OVERRIDE pointing at a valid libdatadog build on macOS" end end @@ -46,7 +46,7 @@ before do skip_if_profiling_not_supported(self) if PlatformHelpers.mac? && Libdatadog.pkgconfig_folder.nil? && ENV["LIBDATADOG_VENDOR_OVERRIDE"].nil? - raise "You have a libdatadog setup without macOS support. Did you forget to set LIBDATADOG_VENDOR_OVERRIDE?" + skip "Needs LIBDATADOG_VENDOR_OVERRIDE pointing at a valid libdatadog build on macOS" end end diff --git a/spec/datadog/profiling/native_extension_spec.rb b/spec/datadog/profiling/native_extension_spec.rb index b3fa0c8163a..4c6adb1dfbc 100644 --- a/spec/datadog/profiling/native_extension_spec.rb +++ b/spec/datadog/profiling/native_extension_spec.rb @@ -12,27 +12,93 @@ describe "grab_gvl_and_raise" do it "raises the requested exception with the passed in message" do - expect { described_class::Testing._native_grab_gvl_and_raise(ZeroDivisionError, "this is a test", nil, true) } - .to raise_exception(ZeroDivisionError, "this is a test") + expect { described_class::Testing._native_grab_gvl_and_raise(Datadog::Profiling::NativeRuntimeError, "this is a test", nil, true) } + .to raise_native_error(Datadog::Profiling::NativeRuntimeError, "this is a test", "this is a test") end - it "accepts printf-style string formatting" do - expect { described_class::Testing._native_grab_gvl_and_raise(ZeroDivisionError, "divided zero by ", 42, true) } - .to raise_exception(ZeroDivisionError, "divided zero by 42") + it "on printf-style, only report the fixed string for telemetry" do + expect { described_class::Testing._native_grab_gvl_and_raise(Datadog::Profiling::NativeRuntimeError, "message %s", "oops", true) } + .to raise_native_error(Datadog::Profiling::NativeRuntimeError, "message oops", "message %s") end it "limits the exception message to 255 characters" do big_message = "a" * 500 - expect { described_class::Testing._native_grab_gvl_and_raise(ZeroDivisionError, big_message, nil, true) } - .to raise_exception(ZeroDivisionError, /a{255}\z/) + expect { described_class::Testing._native_grab_gvl_and_raise(Datadog::Profiling::NativeRuntimeError, big_message, nil, true) } + .to raise_native_error(Datadog::Profiling::NativeRuntimeError, /a{255}\z/, /a{255}\z/) end context "when called without releasing the gvl" do - it "raises a RuntimeError" do - expect { described_class::Testing._native_grab_gvl_and_raise(ZeroDivisionError, "this is a test", nil, false) } - .to raise_exception(RuntimeError, /called by thread holding the global VM lock/) + it "raises a NativeError" do + expect do + described_class::Testing._native_grab_gvl_and_raise(ZeroDivisionError, "message %s", 'oops', false) + end.to raise_native_error( + Datadog::Profiling::NativeRuntimeError, + include('called by thread holding the global VM lock: message oops (ZeroDivisionError)'), + include('called by thread holding the global VM lock: message %s (ZeroDivisionError)'), + ) + end + end + + context "when raising NativeRuntimeError" do + subject(:raise_native_runtime_error) do + described_class::Testing._native_grab_gvl_and_raise(Datadog::Profiling::NativeRuntimeError, "runtime error test", nil, true) + end + + it "raises a NativeRuntimeError" do + expect { raise_native_runtime_error }.to raise_native_error(Datadog::Profiling::NativeRuntimeError, "runtime error test", "runtime error test") + end + + it "is an instance of RuntimeError" do + expect { raise_native_runtime_error }.to raise_exception(RuntimeError) + end + end + + context "when raising NativeArgumentError" do + subject(:raise_native_argument_error) do + described_class::Testing._native_grab_gvl_and_raise(Datadog::Profiling::NativeArgumentError, "argument error test", nil, true) + end + + it "raises a NativeArgumentError" do + expect { raise_native_argument_error }.to raise_native_error(Datadog::Profiling::NativeArgumentError, "argument error test", "argument error test") end + + it "is an instance of ArgumentError" do + expect { raise_native_argument_error }.to raise_exception(ArgumentError) + end + end + + context "when raising NativeTypeError" do + subject(:raise_native_type_error) do + described_class::Testing._native_grab_gvl_and_raise(Datadog::Profiling::NativeTypeError, "type error test", nil, true) + end + + it "raises a NativeTypeError" do + expect { raise_native_type_error }.to raise_native_error(Datadog::Profiling::NativeTypeError, "type error test", "type error test") + end + + it "is an instance of TypeError" do + expect { raise_native_type_error }.to raise_exception(TypeError) + end + end + end + + describe "raise_native_error_with_invalid_class" do + it "raises a safe error when trying to raise a native error with an unsupported error class" do + expect { described_class::Testing._native_raise_native_error_with_invalid_class(ZeroDivisionError, "original", "telemetry") } + .to raise_native_error( + Datadog::Profiling::NativeArgumentError, + include('exception that might not support two error messages') & include("ZeroDivisionError"), + satisfy do |telemetry_message| + telemetry_message.include?('exception that might not support two error messages') && + !telemetry_message.include?("ZeroDivisionError") + end, + ) + end + + it 'raises original error with a supported error class' do + expect { described_class::Testing._native_raise_native_error_with_invalid_class(Datadog::Profiling::NativeRuntimeError, "original", "telemetry") } + .to raise_native_error(Datadog::Profiling::NativeRuntimeError, "original", "telemetry") end end @@ -45,8 +111,8 @@ it "accepts printf-style string formatting" do expect do - described_class::Testing._native_grab_gvl_and_raise_syserr(Errno::EINTR::Errno, "divided zero by ", 42, true) - end.to raise_exception(Errno::EINTR, "#{Errno::EINTR.exception.message} - divided zero by 42") + described_class::Testing._native_grab_gvl_and_raise_syserr(Errno::EINTR::Errno, "message %s", "oops", true) + end.to raise_exception(Errno::EINTR, "#{Errno::EINTR.exception.message} - message oops") end it "limits the caller-provided exception message to 255 characters" do @@ -58,10 +124,14 @@ end context "when called without releasing the gvl" do - it "raises a RuntimeError" do + it "raises a NativeError, preserving the Errno exception class information" do expect do - described_class::Testing._native_grab_gvl_and_raise_syserr(Errno::EINTR::Errno, "this is a test", nil, false) - end.to raise_exception(RuntimeError, /called by thread holding the global VM lock/) + described_class::Testing._native_grab_gvl_and_raise_syserr(Errno::EINTR::Errno, "message %s", "oops", false) + end.to raise_native_error( + Datadog::Profiling::NativeRuntimeError, + include("called by thread holding the global VM lock. syserr_errno: 4, exception_message: 'message oops'"), + include("called by thread holding the global VM lock. syserr_errno: 4, exception_message: 'message %s'") + ) end end end diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb index 05423d3f8ab..3bec8effe0c 100644 --- a/spec/datadog/profiling/spec_helper.rb +++ b/spec/datadog/profiling/spec_helper.rb @@ -24,12 +24,12 @@ def skip_if_profiling_not_supported(testcase) # Profiling is not officially supported on macOS due to missing libdatadog binaries, # but it's still useful to allow it to be enabled for development. - if PlatformHelpers.mac? && ENV["DD_PROFILING_MACOS_TESTING"] != "true" - testcase.skip( - "Profiling is not supported on macOS. If you still want to run these specs, you can use " \ - "DD_PROFILING_MACOS_TESTING=true to override this check." - ) - end + # if PlatformHelpers.mac? && ENV["DD_PROFILING_MACOS_TESTING"] != "true" + # testcase.skip( + # "Profiling is not supported on macOS. If you still want to run these specs, you can use " \ + # "DD_PROFILING_MACOS_TESTING=true to override this check." + # ) + # end return if Datadog::Profiling.supported? @@ -116,3 +116,93 @@ def skip_if_gvl_profiling_not_supported(testcase) RSpec.configure do |config| config.include ProfileHelpers end + +RSpec::Matchers.define :raise_native_error do |expected_class, expected_message = nil, expected_telemetry_message = nil, &block| + unless expected_class.is_a?(Class) && expected_class <= Datadog::Profiling::NativeError + raise ArgumentError, "expected_class must be a subclass of Datadog::Profiling::NativeError" + end + + supports_block_expectations + + def describe_expected(value) + if value.respond_to?(:description) && value.description + value.description + else + value.inspect + end + end + + def match_expected?(expected, actual, attribute) + return true if expected.nil? + + actual_description = actual.nil? ? "nil" : actual.inspect + + if expected.respond_to?(:matches?) + result = expected.matches?(actual) + unless result + @failure_message ||= if expected.respond_to?(:failure_message) + expected.failure_message + else + "expected native exception #{attribute} to match #{describe_expected(expected)}, but was #{actual_description}" + end + end + result + elsif expected.is_a?(Regexp) + actual_string = if actual.is_a?(String) + actual + elsif actual.respond_to?(:to_str) + actual.to_str + else + nil + end + result = actual_string && expected.match?(actual_string) + unless result + @failure_message ||= "expected native exception #{attribute} to match #{expected.inspect}, but was #{actual_description}" + end + result + else + result = actual == expected + unless result + @failure_message ||= "expected native exception #{attribute} to equal #{expected.inspect}, but was #{actual_description}" + end + result + end + end + + match do |actual_proc| + @failure_message = nil + @actual_error = nil + + actual_proc.call + false + rescue Datadog::Profiling::NativeError => e + @actual_error = e + + unless e.is_a?(expected_class) + @failure_message = + "expected native exception of class #{expected_class}, but #{e.class} was raised" + return false + end + + message_matches = match_expected?(expected_message, e.message, "message") + telemetry_matches = match_expected?(expected_telemetry_message, e.telemetry_message, "telemetry message") + + return false unless message_matches && telemetry_matches + + block&.call(e) + + true + end + + failure_message do + if @actual_error + @failure_message || + "expected native exception of class #{expected_class} with message #{expected_message.inspect} " \ + "and telemetry message #{expected_telemetry_message.inspect}, but got #{@actual_error.class} " \ + "with message #{@actual_error.message.inspect} and telemetry message #{@actual_error.telemetry_message.inspect}" + else + "expected native exception of class #{expected_class} with message #{expected_message.inspect} " \ + "and telemetry message #{expected_telemetry_message.inspect}, but no exception was raised" + end + end +end diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 3a75be9a6c9..a0ed7155745 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -679,7 +679,10 @@ def introduce_distinct_stacktraces(i, obj) expect do Datadog::Profiling::Collectors::Stack::Testing ._native_sample(Thread.current, stack_recorder, metric_values, labels, numeric_labels) - end.to raise_error(RuntimeError, /Ended a heap recording/) + end.to raise_error(Datadog::Profiling::NativeRuntimeError) do |error| + expect(error.message).to include("Ended a heap recording") + expect(error.telemetry_message).to eq("Ended a heap recording that was not started") + end end it "does not keep the active slot mutex locked" do @@ -860,6 +863,51 @@ def sample_and_clear end end end + describe "heap recorder telemetry" do + before { skip "Heap profiling is only supported on Ruby >= 2.7" if RUBY_VERSION < "2.7" } + let(:telemetry_stack_recorder) do + described_class.for_testing( + cpu_time_enabled: cpu_time_enabled, + alloc_samples_enabled: alloc_samples_enabled, + heap_samples_enabled: true, + heap_size_enabled: true, + heap_sample_every: heap_sample_every, + timeline_enabled: timeline_enabled, + heap_clean_after_gc_enabled: heap_clean_after_gc_enabled, + ) + end + after do + telemetry_stack_recorder.reset_after_fork + end + it "raises telemetry when heap tracking starts consecutively" do + described_class::Testing._native_track_object(telemetry_stack_recorder, Object.new, 1, "Object") + expect do + described_class::Testing._native_track_object(telemetry_stack_recorder, Object.new, 1, "Object") + end.to raise_error(Datadog::Profiling::NativeRuntimeError) do |error| + expect(error.message).to include("Detected consecutive heap allocation recording starts without end") + expect(error.telemetry_message).to eq("Detected consecutive heap allocation recording starts without end.") + end + end + it "raises telemetry when finishing without preparing" do + expect do + described_class::Testing._native_end_fake_slow_heap_serialization(telemetry_stack_recorder) + end.to raise_error(Datadog::Profiling::NativeRuntimeError) do |error| + expect(error.message).to include("Heap recorder iteration finished without having been prepared") + expect(error.telemetry_message).to eq("Heap recorder iteration finished without having been prepared.") + end + end + it "raises telemetry when preparing twice in a row" do + described_class::Testing._native_start_fake_slow_heap_serialization(telemetry_stack_recorder) + expect do + described_class::Testing._native_start_fake_slow_heap_serialization(telemetry_stack_recorder) + end.to raise_error(Datadog::Profiling::NativeRuntimeError) do |error| + expect(error.message).to include("New heap recorder iteration prepared without the previous one having been finished") + expect(error.telemetry_message).to eq("New heap recorder iteration prepared without the previous one having been finished.") + end + ensure + described_class::Testing._native_end_fake_slow_heap_serialization(telemetry_stack_recorder) + end + end describe "#serialize!" do subject(:serialize!) { stack_recorder.serialize! }