1. 需求场景
在自动化邮件报表中,经常会出现邮件发送不及时、发送出错等问题。因此开发了此系统:当邮件任务执行出错或者未执行时,发送短信报警。
2. 基本思路
在执行邮件任务之前,即在系统调用发送邮件的方法之前,先在邮件检测系统表中记录,状态为失败,发送成功之后更新数据库,置为成功。
Shell 邮件检测脚本每 1 分钟执行一次,遍历邮件任务表(dd_sys_mail_set)和检测表(dd_job_monitor),对比两者找出失败的任务和未执行的任务;一旦发现,就调用短信发送脚本发送短信报警。
3. 整体代码
- #!/bin/sh
- ### DOUGUO JOB INSPECTOSCOPE SNIFFER ###
- ### @author zjf ###
- ### @date 2017-12-27 ###
-
- # INITALIZE SHELL
- . /etc/profile
- . ~/.bash_profile
-
- # DEFINE ERROE CODE
- ERROR_CODE_001="job non-executed"
- ERROR_CODE_002="data result non-compliant formats"
- ERROR_CODE_003="send-mail/exec-job non-succeed"
-
- # INITALIZE WORKING DIR
- _SCRIPT_NAME=$0
- _WORK_DIR=`dirname ${_SCRIPT_NAME}`
- cd ${_WORK_DIR}
-
- # TIME ARGS
- CUR_DATE=`date -d "-0 day" "+%Y-%m-%d"`
- JUDGE_MINUTE=`date -d "-5 minute" +%H:%M`
- #JUDGE_MINUTE="08:05"
- echo "CUR_DATE:$CUR_DATE"
- echo "JUDGE_MINUTE:$JUDGE_MINUTE"
-
- # DB ARGS
- HOSTNAME="192.168.1.135"
- USERNAME="hadooper"
- PASSWORD="hadoop@K+IhBOS"
- DBNAME="douguo_data"
-
- # DEFINE _FUN RETURN RESULT
- g_contain_rs=""
-
- # the area to define functions
- # functions must be defined before use it
#######################################
# Check whether a job key occurs in a list of job records.
# Globals:
#   g_contain_rs (written) - "1" if found, "0" otherwise
# Arguments:
#   $1 - whitespace-separated records, each "key,field,field,..."
#   $2 - record (or bare key) to look for; only the text before the
#        first comma is compared
# Returns:
#   0 always; the result is reported through g_contain_rs
#######################################
contain() {
  g_contain_rs="0"

  local haystack=$1
  # Key of the wanted record, computed once instead of on every iteration.
  local needle=${2%%,*}
  local -a entries=()
  local entry

  # Split on whitespace WITHOUT pathname expansion: a record containing
  # '*' or '?' must be compared literally, not replaced by file names.
  # read returns non-zero at end of input; that is expected here.
  IFS=$' \t\n' read -r -d '' -a entries <<< "${haystack}"

  for entry in "${entries[@]}"; do
    if [[ "${entry%%,*}" == "${needle}" ]]; then
      # already recorded
      g_contain_rs="1"
      return 0
    fi
  done
  return 0
}
-
#######################################
# Update one column of today's monitor row of a job.
# Globals:
#   HOSTNAME, USERNAME, PASSWORD, DBNAME (read) - connection settings
#   MYSQL_BIN (read, optional) - mysql client to run; defaults to the
#     path used by the original script
#   query_update, query_result (written) - statement and client output
# Arguments:
#   $1 - job id (digits only)
#   $2 - column name to change
#   $3 - new value (written as a quoted SQL string)
#   $4 - statistics date, compared with date(statdate)
# Returns:
#   1 if the job id is not numeric, otherwise the mysql exit status
#######################################
update_job_status() {
  local job_id=$1
  local column=$2
  local value=$3
  local stat_date=$4
  # NOTE(review): the mixed-case "MysqL" path looks like an extraction
  # artifact of /usr/local/mysql/bin/mysql - confirm on the target host.
  local mysql_bin=${MYSQL_BIN:-/usr/local/MysqL/bin/MysqL}

  # job_id is placed unquoted into the statement; an empty or non-numeric
  # value would yield malformed SQL, so refuse it up front.
  if [[ ! "${job_id}" =~ ^[0-9]+$ ]]; then
    echo "update_job_status: invalid job id '${job_id}'" >&2
    return 1
  fi

  query_update="update dd_job_monitor set ${column}='${value}' where job_id=${job_id} and date(statdate)='${stat_date}'"
  # Every argument is quoted so credentials containing spaces or glob
  # characters reach the client unchanged.
  query_result=$("${mysql_bin}" -h"${HOSTNAME}" -u"${USERNAME}" -p"${PASSWORD}" -D "${DBNAME}" -e "${query_update}")
}
-
#######################################
# Insert a monitor row for a job that never ran, so that the alarm module
# does not raise the same alarm again on the next run.
# Globals:
#   HOSTNAME, USERNAME, PASSWORD, DBNAME (read) - connection settings
#   MYSQL_BIN (read, optional) - mysql client to run; defaults to the
#     path used by the original script
#   query_update, query_result (written) - statement and client output
# Arguments:
#   $1 - job id (digits only)
#   $2 - job name / mail subject
#   $3 - statistics date stored in statdate
# Outputs:
#   prints the generated statement to stdout
# Returns:
#   1 if the job id is not numeric, otherwise the mysql exit status
#######################################
insert_job() {
  local job_id=$1
  local job_name=$2
  local stat_date=$3
  # NOTE(review): the mixed-case "MysqL" path looks like an extraction
  # artifact of /usr/local/mysql/bin/mysql - confirm on the target host.
  local mysql_bin=${MYSQL_BIN:-/usr/local/MysqL/bin/MysqL}
  local sq="'"

  # job_id is placed unquoted into the statement; refuse anything that is
  # not a plain number instead of sending malformed SQL.
  if [[ ! "${job_id}" =~ ^[0-9]+$ ]]; then
    echo "insert_job: invalid job id '${job_id}'" >&2
    return 1
  fi

  # Double embedded single quotes so a subject such as O'Brien does not
  # terminate the SQL string literal early.
  job_name=${job_name//"${sq}"/${sq}${sq}}

  # NOTE(review): the column list names 10 columns but only 6 values are
  # supplied; the VALUES list appears truncated in this copy of the script
  # and must be completed against the real dd_job_monitor schema.
  query_update="INSERT INTO dd_job_monitor(job_id,job_type,send_type,job_name,run_time,msg_reciver,data_status,job_status,alarm_status,statdate) VALUES(${job_id},'alarm','${job_name}','',1,'${stat_date}')"
  echo "$query_update"
  query_result=$("${mysql_bin}" -h"${HOSTNAME}" -u"${USERNAME}" -p"${PASSWORD}" -D "${DBNAME}" -e "${query_update}")
}
-
# query languages
## !modify log 2018-02-26 :
# QUERYA excludes mail ids 36 and 12: both are scheduled within the last
# 5 minutes of the day and, because this checker looks 5 minutes back,
# they produced duplicate alarms. They are therefore no longer inspected.
#
# Each QUERYA row is "<id><subject>,<HH:MM>".
QUERYA="select concat_ws(',',concat_ws('',id,subject),right(send_time,5)) from dd_sys_mail_set where send_type='day' and id not in (36,12);"
# Previous version, without the id filter:
#QUERYA="select concat_ws(',',concat_ws('',id,subject),right(send_time,5)) from dd_sys_mail_set where send_type='day';"
#
# Each QUERYB row is
# "<job_id><job_name>,<HH:MM>,<data_status>,<job_status>,<alarm_status>".
# NOTE(review): this statement was garbled in the available copy of the
# script. It is reconstructed from QUERYA and from the five comma-separated
# fields the inspection loop reads - confirm against the production query.
QUERYB="select concat_ws(',',concat_ws('',job_id,job_name),right(run_time,5),data_status,job_status,alarm_status) from dd_job_monitor where date(statdate)='${CUR_DATE}'"

# query from MySQL
# NOTE(review): the mixed-case "MysqL" path looks like an extraction
# artifact of /usr/local/mysql/bin/mysql - confirm on the target host.
_mysql_bin="${MYSQL_BIN:-/usr/local/MysqL/bin/MysqL}"
count_a=$("${_mysql_bin}" -h"${HOSTNAME}" -u"${USERNAME}" -p"${PASSWORD}" -D "${DBNAME}" -e "${QUERYA}" --skip-column-names)
count_b=$("${_mysql_bin}" -h"${HOSTNAME}" -u"${USERNAME}" -p"${PASSWORD}" -D "${DBNAME}" -e "${QUERYB}" --skip-column-names)

# Write the results to data files: tabs become commas and the first line
# ('log in file /var/MysqL/query.log', emitted ahead of the rows) is dropped.
mkdir -p "${_WORK_DIR}/tmp" || exit 1
printf '%s\n' "${count_a}" | sed -e '1d' -e 's/\t/,/g' > "${_WORK_DIR}/tmp/queryloga.log"
printf '%s\n' "${count_b}" | sed -e '1d' -e 's/\t/,/g' > "${_WORK_DIR}/tmp/querylogb.log"

# read log a and create array a (one element per non-empty line)
ARR_RS_A=()
while IFS= read -r line; do
  [[ -n "${line}" ]] || continue
  ARR_RS_A+=("${line}")
done < "${_WORK_DIR}/tmp/queryloga.log"
LOOP_FLAG_A=${#ARR_RS_A[@]}

# read log b and create array b (one element per non-empty line)
ARR_RS_B=()
while IFS= read -r line; do
  [[ -n "${line}" ]] || continue
  ARR_RS_B+=("${line}")
done < "${_WORK_DIR}/tmp/querylogb.log"
LOOP_FLAG_B=${#ARR_RS_B[@]}
-
# Path of the SMS gateway script; can be overridden through the environment.
SMS_SEND_SCRIPT="${SMS_SEND_SCRIPT:-/opt/DATA/goldmine/src/utils/sms/sms_send.sh}"

# Print the numeric id prefix of a job key such as "36Daily report".
# NOTE(review): the queries concatenate id and subject without a separator
# (concat_ws('',...)) and the original `cut -d ''` is not a valid cut call,
# so the key is split at the first non-digit. A subject that itself starts
# with a digit cannot be told apart from the id - confirm the key format.
job_key_id() {
  printf '%s\n' "${1%%[!0-9]*}"
}

# Print the subject part of a job key (everything after the id prefix).
job_key_subject() {
  local id=${1%%[!0-9]*}
  printf '%s\n' "${1#"${id}"}"
}

# Append an alarm text to job.log and send it as SMS.
# The gateway script is called directly: wrapping it in backticks, as the
# original did, would execute whatever the script prints as a command.
send_alarm() {
  local msg=$1
  echo "${msg}" >> "${_WORK_DIR}/job.log"
  sh "${SMS_SEND_SCRIPT}" "${msg}" \
    || echo "sms_send failed for: ${msg}" >&2
}

#######################################
# Loop over the scheduled jobs and decide whether an alarm must be sent.
# Globals:
#   ARR_RS_A (read) - scheduled jobs, "<key>,<HH:MM>"
#   ARR_RS_B (read) - monitor rows, "<key>,<HH:MM>,<data_status>,
#                     <job_status>,<alarm_status>"
#   JUDGE_MINUTE, CUR_DATE, ERROR_CODE_001..003, _WORK_DIR (read)
#   g_contain_rs (read; written by contain)
# Calls: contain, insert_job, update_job_status (defined elsewhere)
#######################################
inspect_jobs() {
  local sys_job sys_key sys_minute sys_id sys_subject
  local log_job log_key log_data_status log_job_status log_alarm_status
  local log_id log_subject

  for sys_job in "${ARR_RS_A[@]}"; do
    # split the job info
    IFS=',' read -r sys_key sys_minute _ <<< "${sys_job}"

    # Only a job scheduled exactly 5 minutes ago is due: by now it should
    # have been recorded in ARR_RS_B.
    [[ "${sys_minute}" == "${JUDGE_MINUTE}" ]] || continue

    sys_id=$(job_key_id "${sys_key}")
    sys_subject=$(job_key_subject "${sys_key}")

    # is the job logged in the monitor table? 1 : 0
    contain "${ARR_RS_B[*]}" "${sys_key}"

    # 0: job not in table -> it never ran
    if [[ "${g_contain_rs}" == "0" ]]; then
      send_alarm "DC_WEB: MAIL: ERROR:001: ${sys_id}: ${sys_subject}: ${ERROR_CODE_001}"
      insert_job "${sys_id}" "${sys_subject}" "${CUR_DATE}"
      continue
    fi

    # test...
    echo "logging...<${CUR_DATE}> the inspectoscope has logged the job:${sys_id}:${sys_subject}" >> "${_WORK_DIR}/job.log"

    for log_job in "${ARR_RS_B[@]}"; do
      IFS=',' read -r log_key _ log_data_status log_job_status log_alarm_status _ <<< "${log_job}"
      [[ "${sys_key}" == "${log_key}" ]] || continue

      log_id=$(job_key_id "${log_key}")
      log_subject=$(job_key_subject "${log_key}")

      # alarm_status 1: already warned, 0: not warned yet
      if [[ "${log_data_status}" == "-1" && "${log_alarm_status}" == "0" ]]; then
        send_alarm "DC_WEB: MAIL: ERROR:002: ${log_id}: ${log_subject}: ${ERROR_CODE_002}"
      fi
      if [[ "${log_job_status}" == "-1" && "${log_alarm_status}" == "0" ]]; then
        send_alarm "DC_WEB: MAIL: ERROR:003: ${log_id}: ${log_subject}: ${ERROR_CODE_003}"
      fi

      # update job alarm status so the same row is not alarmed twice
      update_job_status "${log_id}" "alarm_status" "1" "${CUR_DATE}"

      # test...
      echo "logging...<${CUR_DATE}> the inspectoscope has alarm the job:${log_id}:${log_subject}" >> "${_WORK_DIR}/job.log"
    done
  done
}

inspect_jobs