-- New configuration column "escalate" to provide alert escalation capability. Allows for when a job fails at a certain alert level a given number of times, the alert level in the monitoring is raised to the next level up. (Experimental. Please provide feedback) -- Adjust monitor trigger on job_log table to be able to handle custom alert codes better. -- Included alert_text value in description returned by check_job_status to make it clearer how many of each alert status occurred. -- Fixed pgTAP tests to pass properly and account for new return values. Added tests for escalation. ALTER TABLE @extschema@.job_check_config ADD escalate int; /* * Escalation trigger to cause the alert_code value of job_check_log to be higher than the originally inserted value * if the escalation policy is set and the number of failed jobs exceeds the configured value */ CREATE FUNCTION job_check_log_escalate_trig() RETURNS trigger LANGUAGE plpgsql AS $$ DECLARE v_count int; v_escalate int; v_highest_logged_alert_code int; v_max_alert_code int; v_step_id bigint; BEGIN SELECT escalate INTO v_escalate FROM @extschema@.job_check_config WHERE job_name = NEW.job_name; IF v_escalate IS NOT NULL THEN -- Only get the count of the highest number of failures for an alert code for a specific job SELECT count(job_name) , alert_code INTO v_count , v_highest_logged_alert_code FROM @extschema@.job_check_log WHERE job_name = NEW.job_name GROUP BY job_name, alert_code ORDER BY alert_code DESC LIMIT 1 ; -- Ensure new alert codes are always equal to at least the last escalated value IF v_highest_logged_alert_code > NEW.alert_code THEN NEW.alert_code = v_highest_logged_alert_code; END IF; -- +1 to ensure the insertion that matches the v_escalate value triggers the escalation, not the next insertion IF v_count + 1 >= v_escalate THEN SELECT max(alert_code) INTO v_max_alert_code FROM @extschema@.job_status_text; IF NEW.alert_code < v_max_alert_code THEN -- Don't exceed the highest configured alert code NEW.alert_code = NEW.alert_code + 1; -- Log that alert code was escalated by the last job that failed EXECUTE 'SELECT @extschema@.add_step('||NEW.job_id||', ''ALERT ESCALATION'')' INTO v_step_id; EXECUTE 'SELECT @extschema@.update_step('||v_step_id||', ''ESCALATE'', ''Job has alerted at level '||NEW.alert_code - 1 ||' in excess of the escalate value configured for this job ('||v_escalate|| '). Alert code value has been escaleted to: '||NEW.alert_code||''')'; EXECUTE 'UPDATE @extschema@.job_log SET status = ''ESCALATED'' WHERE job_id = '||NEW.job_id; END IF; END IF; END IF; RETURN NEW; END $$; CREATE TRIGGER job_check_log_escalate_trig BEFORE INSERT ON @extschema@.job_check_log FOR EACH ROW EXECUTE PROCEDURE job_check_log_escalate_trig(); /* * Job Monitor Trigger */ CREATE OR REPLACE FUNCTION job_monitor() RETURNS trigger LANGUAGE plpgsql AS $$ DECLARE v_alert_code int; BEGIN SELECT alert_code INTO v_alert_code FROM @extschema@.job_status_text WHERE alert_text = NEW.status; IF v_alert_code IS NOT NULL THEN IF v_alert_code = 1 THEN DELETE FROM @extschema@.job_check_log WHERE job_name = NEW.job_name; ELSE INSERT INTO @extschema@.job_check_log (job_id, job_name, alert_code) VALUES (NEW.job_id, NEW.job_name, v_alert_code); END IF; END IF; RETURN NULL; END $$; /* * Check Job status * * p_history is how far into job_log's past the check will go. Don't go further back than the longest job's interval that is contained * in job_check_config to keep check efficient * Return code 1 means a successful job run * Return code 2 is for use with jobs that support a warning indicator. Not critical, but someone should look into it * Return code 3 is for use with a critical job failure */ CREATE OR REPLACE FUNCTION check_job_status(p_history interval, OUT alert_code int, OUT alert_status text, OUT job_name text, OUT alert_text text) RETURNS SETOF record LANGUAGE plpgsql AS $$ DECLARE v_count int = 1; v_longest_period interval; v_row record; v_rowcount int; v_problem_count int := 0; v_version int; BEGIN -- Leave this check here in case helper function isn't used and this is called directly with an interval argument SELECT greatest(max(error_threshold), max(warn_threshold)) INTO v_longest_period FROM @extschema@.job_check_config; IF v_longest_period IS NOT NULL THEN IF p_history < v_longest_period THEN RAISE EXCEPTION 'Input argument must be greater than or equal to the longest threshold in job_check_config table'; END IF; END IF; SELECT current_setting('server_version_num')::int INTO v_version; CREATE TEMP TABLE jobmon_check_job_status_temp (alert_code int, alert_status text, job_name text, alert_text text, pid int); -- Check for jobs with three consecutive errors and not set for any special configuration INSERT INTO jobmon_check_job_status_temp (alert_code, alert_status, job_name, alert_text) SELECT l.alert_code, 'FAILED_RUN' AS alert_status, l.job_name, '3 consecutive '||t.alert_text||' runs' AS alert_text FROM @extschema@.job_check_log l JOIN @extschema@.job_status_text t ON l.alert_code = t.alert_code WHERE l.job_name NOT IN ( SELECT c.job_name FROM @extschema@.job_check_config c ) GROUP BY l.job_name, l.alert_code, t.alert_text HAVING count(*) > 2; GET DIAGNOSTICS v_rowcount = ROW_COUNT; IF v_rowcount IS NOT NULL AND v_rowcount > 0 THEN v_problem_count := v_problem_count + 1; END IF; -- Check for jobs with specially configured sensitivity INSERT INTO jobmon_check_job_status_temp (alert_code, alert_status, job_name, alert_text) SELECT l.alert_code, 'FAILED_RUN' as alert_status, l.job_name, count(*)||' '||t.alert_text||' run(s)' AS alert_text FROM @extschema@.job_check_log l JOIN @extschema@.job_check_config c ON l.job_name = c.job_name JOIN @extschema@.job_status_text t ON l.alert_code = t.alert_code GROUP BY l.job_name, l.alert_code, t.alert_text, c.sensitivity HAVING count(*) > c.sensitivity; GET DIAGNOSTICS v_rowcount = ROW_COUNT; IF v_rowcount IS NOT NULL AND v_rowcount > 0 THEN v_problem_count := v_problem_count + 1; END IF; -- Check for missing jobs that have configured time thresholds. Jobs that have not run since before the p_history will return pid as NULL INSERT INTO jobmon_check_job_status_temp (alert_code, alert_status, job_name, alert_text, pid) SELECT CASE WHEN l.max_start IS NULL AND l.end_time IS NULL THEN 3 WHEN (CURRENT_TIMESTAMP - l.max_start) > c.error_threshold THEN 3 WHEN (CURRENT_TIMESTAMP - l.max_start) > c.warn_threshold THEN 2 ELSE 3 END AS ac , CASE WHEN (CURRENT_TIMESTAMP - l.max_start) > c.warn_threshold OR l.end_time IS NULL THEN 'MISSING' ELSE l.status END AS alert_status , c.job_name , COALESCE('Last completed run: '||l.max_end, 'Has not completed a run since highest configured monitoring time period') AS alert_text , l.pid FROM @extschema@.job_check_config c LEFT JOIN ( WITH max_start_time AS ( SELECT w.job_name, max(w.start_time) as max_start, max(w.end_time) as max_end FROM @extschema@.job_log w WHERE start_time > (CURRENT_TIMESTAMP - p_history) GROUP BY w.job_name) SELECT a.job_name, a.end_time, a.status, a.pid, m.max_start, m.max_end FROM @extschema@.job_log a JOIN max_start_time m ON a.job_name = m.job_name and a.start_time = m.max_start WHERE start_time > (CURRENT_TIMESTAMP - p_history) ) l ON c.job_name = l.job_name WHERE c.active AND (CURRENT_TIMESTAMP - l.max_start) > c.warn_threshold OR l.max_start IS NULL ORDER BY ac, l.job_name, l.max_start; GET DIAGNOSTICS v_rowcount = ROW_COUNT; IF v_rowcount IS NOT NULL AND v_rowcount > 0 THEN v_problem_count := v_problem_count + 1; END IF; -- Check for BLOCKED after RUNNING to ensure blocked jobs are labelled properly IF v_version >= 90200 THEN -- Jobs currently running that have not run before within their configured monitoring time period FOR v_row IN SELECT j.job_name FROM @extschema@.job_log j JOIN @extschema@.job_check_config c ON j.job_name = c.job_name JOIN pg_catalog.pg_stat_activity a ON j.pid = a.pid WHERE j.start_time > (CURRENT_TIMESTAMP - p_history) AND (CURRENT_TIMESTAMP - j.start_time) >= least(c.warn_threshold, c.error_threshold) AND j.end_time IS NULL LOOP UPDATE jobmon_check_job_status_temp t SET alert_status = 'RUNNING' , alert_text = (SELECT COALESCE('Currently running. Last completed run: '||max(end_time), 'Currently running. Job has not had a completed run within configured monitoring time period.') FROM @extschema@.job_log WHERE job_log.job_name = v_row.job_name AND job_log.start_time > (CURRENT_TIMESTAMP - p_history)) WHERE t.job_name = v_row.job_name; END LOOP; -- Jobs blocked by locks FOR v_row IN SELECT j.job_name FROM @extschema@.job_log j JOIN pg_catalog.pg_locks l ON j.pid = l.pid JOIN pg_catalog.pg_stat_activity a ON j.pid = a.pid WHERE j.start_time > (CURRENT_TIMESTAMP - p_history) AND NOT l.granted LOOP UPDATE jobmon_check_job_status_temp t SET alert_status = 'BLOCKED' , alert_text = COALESCE('Another transaction has a lock that blocking this job from completing') WHERE t.job_name = v_row.job_name; END LOOP; ELSE -- version less than 9.2 with old procpid column -- Jobs currently running that have not run before within their configured monitoring time period FOR v_row IN SELECT j.job_name FROM @extschema@.job_log j JOIN @extschema@.job_check_config c ON j.job_name = c.job_name JOIN pg_catalog.pg_stat_activity a ON j.pid = a.procpid WHERE j.start_time > (CURRENT_TIMESTAMP - p_history) AND (CURRENT_TIMESTAMP - j.start_time) >= least(c.warn_threshold, c.error_threshold) AND j.end_time IS NULL LOOP UPDATE jobmon_check_job_status_temp t SET alert_status = 'RUNNING' , alert_text = (SELECT COALESCE('Currently running. Last completed run: '||max(end_time), 'Currently running. Job has not had a completed run within configured monitoring time period.') FROM @extschema@.job_log WHERE job_log.job_name = v_row.job_name AND job_log.start_time > (CURRENT_TIMESTAMP - p_history)) WHERE t.job_name = v_row.job_name; END LOOP; -- Jobs blocked by locks FOR v_row IN SELECT j.job_name FROM @extschema@.job_log j JOIN pg_catalog.pg_locks l ON j.pid = l.pid JOIN pg_catalog.pg_stat_activity a ON j.pid = a.procpid WHERE j.start_time > (CURRENT_TIMESTAMP - p_history) AND NOT l.granted LOOP UPDATE jobmon_check_job_status_temp t SET alert_status = 'BLOCKED' , alert_text = COALESCE('Another transaction has a lock that blocking this job from completing') WHERE t.job_name = v_row.job_name; END LOOP; END IF; -- end version check IF IF v_problem_count > 0 THEN FOR v_row IN SELECT t.alert_code, t.alert_status, t.job_name, t.alert_text FROM jobmon_check_job_status_temp t ORDER BY alert_code DESC, job_name ASC, alert_status ASC LOOP alert_code := v_row.alert_code; alert_status := v_row.alert_status; job_name := v_row.job_name; alert_text := v_row.alert_text; RETURN NEXT; END LOOP; ELSE alert_code := 1; alert_status := 'OK'; job_name := NULL; alert_text := 'All jobs run successfully'; RETURN NEXT; END IF; DROP TABLE IF EXISTS jobmon_check_job_status_temp; END $$; /* * Helper function to allow calling without an argument. */ CREATE OR REPLACE FUNCTION check_job_status(OUT alert_code int, OUT alert_status text, OUT job_name text, OUT alert_text text) RETURNS SETOF record LANGUAGE plpgsql STABLE AS $$ DECLARE v_longest_period interval; v_row record; BEGIN -- Interval doesn't matter if nothing is in job_check_config. Just give default of 1 week. -- Still monitors for any 3 consecutive failures. SELECT COALESCE(greatest(max(error_threshold), max(warn_threshold)), '1 week') INTO v_longest_period FROM @extschema@.job_check_config; FOR v_row IN SELECT q.alert_code, q.alert_status, q.job_name, q.alert_text FROM @extschema@.check_job_status(v_longest_period) q LOOP alert_code := v_row.alert_code; alert_status := v_row.alert_status; job_name := v_row.job_name; alert_text := v_row.alert_text; RETURN NEXT; END LOOP; END $$;