-- Critical Bug Fix: Version 1.0 accidentally removed the creation of the trigger on the job_log table so that failing jobs would never cause check_job_status() to report a failed job. Jobs that were configured to run within a certain time period were still monitored for. This only affects new installations of pg_jobmon since 1.0. If you've upgraded from a previous version, the trigger is still working properly. -- Redesigned check_job_status() to return more detailed, and more easily filtered data on the current status of running jobs. Please check how your monitoring software used this function to ensure it can handle the new output format properly. Each problem job is returned in its own row instead of all results being returned in a single row. If a single row is still desired, the highest alert level job in alphabetical order of job_name is always returned first, so a LIMIT 1 can be used as an easy solution. More advanced filtering is now possible, though. See the updated pg_jobmon.md doc for some examples. -- Wrote pgTAP tests and some other custom tests to better validate future changes -- Preserve dropped function privileges. Re-applied at the end of this update. CREATE TEMP TABLE mimeo_preserve_privs_temp (statement text); INSERT INTO mimeo_preserve_privs_temp SELECT 'GRANT EXECUTE ON FUNCTION check_job_status(interval) TO '||array_to_string(array_agg(grantee::text), ',')||';' FROM information_schema.routine_privileges WHERE routine_schema = '@extschema@' AND routine_name = 'check_job_status'; INSERT INTO mimeo_preserve_privs_temp SELECT 'GRANT EXECUTE ON FUNCTION check_job_status() TO '||array_to_string(array_agg(grantee::text), ',')||';' FROM information_schema.routine_privileges WHERE routine_schema = '@extschema@' AND routine_name = 'check_job_status'; CREATE FUNCTION replay_preserved_privs() RETURNS void LANGUAGE plpgsql AS $$ DECLARE v_row record; BEGIN FOR v_row IN SELECT statement FROM mimeo_preserve_privs_temp LOOP EXECUTE v_row.statement; END LOOP; END $$; DROP FUNCTION check_job_status(interval); DROP FUNCTION check_job_status(); DROP TRIGGER IF EXISTS trg_job_monitor ON @extschema@.job_log; CREATE TRIGGER trg_job_monitor AFTER UPDATE ON @extschema@.job_log FOR EACH ROW EXECUTE PROCEDURE @extschema@.job_monitor(); /* * Check Job status * * p_history is how far into job_log's past the check will go. Don't go further back than the longest job's interval that is contained * in job_check_config to keep check efficient * Return code 1 means a successful job run * Return code 2 is for use with jobs that support a warning indicator. Not critical, but someone should look into it * Return code 3 is for use with a critical job failure */ CREATE FUNCTION check_job_status(p_history interval, OUT alert_code int, OUT alert_status text, OUT job_name text, OUT alert_text text) RETURNS SETOF record LANGUAGE plpgsql AS $$ DECLARE v_count int = 1; v_longest_period interval; v_row record; v_rowcount int; v_version int; BEGIN -- Leave this check here in case helper function isn't used and this is called directly with an interval argument SELECT greatest(max(error_threshold), max(warn_threshold)) INTO v_longest_period FROM @extschema@.job_check_config; IF v_longest_period IS NOT NULL THEN IF p_history < v_longest_period THEN RAISE EXCEPTION 'Input argument must be greater than or equal to the longest threshold in job_check_config table'; END IF; END IF; SELECT current_setting('server_version_num')::int INTO v_version; CREATE TEMP TABLE jobmon_check_job_status_temp (alert_code int, alert_status text, job_name text, alert_text text, pid int); -- Check for jobs with three consecutive errors and not set for any special configuration INSERT INTO jobmon_check_job_status_temp (alert_code, alert_status, job_name, alert_text) SELECT l.alert_code, 'FAILED_RUN' AS alert_status, l.job_name, '3 consecutive job failures' AS alert_text FROM @extschema@.job_check_log l WHERE l.job_name NOT IN ( SELECT c.job_name FROM @extschema@.job_check_config c ) GROUP BY l.job_name, l.alert_code HAVING count(*) > 2; GET DIAGNOSTICS v_rowcount = ROW_COUNT; -- Check for jobs with specially configured sensitivity INSERT INTO jobmon_check_job_status_temp (alert_code, alert_status, job_name, alert_text) SELECT l.alert_code, 'FAILED_RUN' as alert_status, l.job_name, count(*)||' consecutive job failure(s)' AS alert_text FROM @extschema@.job_check_log l JOIN @extschema@.job_check_config c ON l.job_name = c.job_name GROUP BY l.job_name, l.alert_code, c.sensitivity HAVING count(*) > c.sensitivity; GET DIAGNOSTICS v_rowcount = ROW_COUNT; -- Check for missing jobs that have configured time thresholds. Jobs that have not run since before the p_history will return pid as NULL INSERT INTO jobmon_check_job_status_temp (alert_code, alert_status, job_name, alert_text, pid) SELECT CASE WHEN l.max_start IS NULL AND l.end_time IS NULL THEN 3 WHEN (CURRENT_TIMESTAMP - l.max_start) > c.error_threshold THEN 3 WHEN (CURRENT_TIMESTAMP - l.max_start) > c.warn_threshold THEN 2 ELSE 3 END AS ac , CASE WHEN (CURRENT_TIMESTAMP - l.max_start) > c.warn_threshold OR l.end_time IS NULL THEN 'MISSING' ELSE l.status END AS alert_status , c.job_name , COALESCE('Last completed run: '||l.max_end, 'Has not completed a run since highest configured monitoring time period') AS alert_text , l.pid FROM @extschema@.job_check_config c LEFT JOIN ( WITH max_start_time AS ( SELECT w.job_name, max(w.start_time) as max_start, max(w.end_time) as max_end FROM @extschema@.job_log w WHERE start_time > (CURRENT_TIMESTAMP - p_history) GROUP BY w.job_name) SELECT a.job_name, a.end_time, a.status, a.pid, m.max_start, m.max_end FROM @extschema@.job_log a JOIN max_start_time m ON a.job_name = m.job_name and a.start_time = m.max_start WHERE start_time > (CURRENT_TIMESTAMP - p_history) ) l ON c.job_name = l.job_name WHERE c.active AND (CURRENT_TIMESTAMP - l.max_start) > c.warn_threshold OR l.max_start IS NULL ORDER BY ac, l.job_name, l.max_start; GET DIAGNOSTICS v_rowcount = ROW_COUNT; -- Check for BLOCKED after RUNNING to ensure blocked jobs are labelled properly IF v_version >= 90200 THEN -- Jobs currently running that have not run before within their configured monitoring time period FOR v_row IN SELECT j.job_name FROM @extschema@.job_log j JOIN @extschema@.job_check_config c ON j.job_name = c.job_name JOIN pg_catalog.pg_stat_activity a ON j.pid = a.pid WHERE j.start_time > (CURRENT_TIMESTAMP - p_history) AND (CURRENT_TIMESTAMP - j.start_time) >= least(c.warn_threshold, c.error_threshold) AND j.end_time IS NULL LOOP UPDATE jobmon_check_job_status_temp t SET alert_status = 'RUNNING' , alert_text = (SELECT COALESCE('Currently running. Last completed run: '||max(end_time), 'Currently running. Job has not had a completed run within configured monitoring time period.') FROM @extschema@.job_log WHERE job_log.job_name = v_row.job_name AND job_log.start_time > (CURRENT_TIMESTAMP - p_history)) WHERE t.job_name = v_row.job_name; END LOOP; -- Jobs blocked by locks FOR v_row IN SELECT j.job_name FROM @extschema@.job_log j JOIN pg_catalog.pg_locks l ON j.pid = l.pid JOIN pg_catalog.pg_stat_activity a ON j.pid = a.pid WHERE j.start_time > (CURRENT_TIMESTAMP - p_history) AND NOT l.granted LOOP UPDATE jobmon_check_job_status_temp t SET alert_status = 'BLOCKED' , alert_text = COALESCE('Another transaction has a lock that blocking this job from completing') WHERE t.job_name = v_row.job_name; END LOOP; ELSE -- version less than 9.2 with old procpid column -- Jobs currently running that have not run before within their configured monitoring time period FOR v_row IN SELECT j.job_name FROM @extschema@.job_log j JOIN @extschema@.job_check_config c ON j.job_name = c.job_name JOIN pg_catalog.pg_stat_activity a ON j.pid = a.procpid WHERE j.start_time > (CURRENT_TIMESTAMP - p_history) AND (CURRENT_TIMESTAMP - j.start_time) >= least(c.warn_threshold, c.error_threshold) AND j.end_time IS NULL LOOP UPDATE jobmon_check_job_status_temp t SET alert_status = 'RUNNING' , alert_text = (SELECT COALESCE('Currently running. Last completed run: '||max(end_time), 'Currently running. Job has not had a completed run within configured monitoring time period.') FROM @extschema@.job_log WHERE job_log.job_name = v_row.job_name AND job_log.start_time > (CURRENT_TIMESTAMP - p_history)) WHERE t.job_name = v_row.job_name; END LOOP; -- Jobs blocked by locks FOR v_row IN SELECT j.job_name FROM @extschema@.job_log j JOIN pg_catalog.pg_locks l ON j.pid = l.pid JOIN pg_catalog.pg_stat_activity a ON j.pid = a.procpid WHERE j.start_time > (CURRENT_TIMESTAMP - p_history) AND NOT l.granted LOOP UPDATE jobmon_check_job_status_temp t SET alert_status = 'BLOCKED' , alert_text = COALESCE('Another transaction has a lock that blocking this job from completing') WHERE t.job_name = v_row.job_name; END LOOP; END IF; -- end version check IF IF v_rowcount IS NOT NULL AND v_rowcount > 0 THEN FOR v_row IN SELECT t.alert_code, t.alert_status, t.job_name, t.alert_text FROM jobmon_check_job_status_temp t ORDER BY alert_code DESC, job_name ASC, alert_status ASC LOOP alert_code := v_row.alert_code; alert_status := v_row.alert_status; job_name := v_row.job_name; alert_text := v_row.alert_text; RETURN NEXT; END LOOP; ELSE alert_code := 1; alert_status := 'OK'; job_name := NULL; alert_text := 'All jobs run successfully'; RETURN NEXT; END IF; DROP TABLE IF EXISTS jobmon_check_job_status_temp; END $$; /* * Helper function to allow calling without an argument. */ CREATE FUNCTION check_job_status(OUT alert_code int, OUT alert_status text, OUT job_name text, OUT alert_text text) RETURNS SETOF record LANGUAGE plpgsql STABLE AS $$ DECLARE v_longest_period interval; v_row record; BEGIN -- Interval doesn't matter if nothing is in job_check_config. Just give default of 1 week. -- Still monitors for any 3 consecutive failures. SELECT COALESCE(greatest(max(error_threshold), max(warn_threshold)), '1 week') INTO v_longest_period FROM @extschema@.job_check_config; FOR v_row IN SELECT q.alert_code, q.alert_status, q.job_name, q.alert_text FROM @extschema@.check_job_status(v_longest_period) q LOOP alert_code := v_row.alert_code; alert_status := v_row.alert_status; job_name := v_row.job_name; alert_text := v_row.alert_text; RETURN NEXT; END LOOP; END $$; -- Restore original privileges to objects that were dropped SELECT @extschema@.replay_preserved_privs(); DROP FUNCTION @extschema@.replay_preserved_privs(); DROP TABLE mimeo_preserve_privs_temp;