#!/bin/bash

# See details at https://confluence.eng.vmware.com/pages/viewpage.action?pageId=216826702

#######################################
# Send a notification email to the customer by running one of the UM mailer
# classes as the usgmtr user.
# Globals:   appDir, webappClasses (read) - not defined in this function;
#            presumably provided by umsetenv.sh - TODO confirm
# Arguments: $1 - simple name of the class to run; every mailer class lives in
#                 the com.vmware.cloud.usgmtr.mail.* package, so
#                 com.vmware.cloud.usgmtr.mail.$1 is what Java executes
#            $2 - optional argument handed to that class (e.g. an error code)
# Returns:   exit status of the sudo/java invocation (0 on success)
#######################################
function sendEmailToCustomer {
    logger "[UM Monitor] Attempting to send email with arguments $1 $2"
    # Kept local so the helper does not leak a global named "lib".
    local lib=$appDir/WEB-INF/lib
    local rc=0
    # NOTE(review): umsetenv.sh is executed here, not sourced, so variables it
    # sets do not reach the java process; $webappClasses and $lib are expanded
    # by the calling shell. Confirm this is intended.
    sudo -u usgmtr sh -c "/opt/vmware/cloudusagemetering/scripts/umsetenv.sh; /usr/java/latest/bin/java -cp $webappClasses:$lib/* com.vmware.cloud.usgmtr.mail.$1 $2" || rc=$?
    # Only claim success when the mailer really exited with status 0.
    if [ "$rc" -eq 0 ]; then
        logger "[UM Monitor] Sent email to customers"
    else
        logger "[UM Monitor] Failed to send email to customers, exit status $rc"
    fi
    return "$rc"
}

# -------------------------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------------
# Tools, endpoint and state files used by the monitor.
CURL=/usr/bin/curl
UMIP=127.0.0.1
# Run counter, incremented on every invocation that gets past the gates.
COUNTER_FN=/tmp/umhealth_counter.txt
# Epoch seconds written around the last restart done by this script.
LAST_RESTART_FN=/tmp/um_last_reboot.txt
# Presence of this file disables the monitor.
LOCK_FILE=/tmp/um-migration.lock

# Nothing is checked while the migration lock file exists.
# Request: https://bugzilla.eng.vmware.com/show_bug.cgi?id=1898100
if [[ -e "$LOCK_FILE" ]]; then
    logger "[UM Monitor] Skipping health check because of migration locking"
    exit 0
fi

# Grace period in seconds (580 s, just under 10 minutes): UM status is not
# checked this soon after an OS boot or after a restart done by this script.
UM_START_TIMEOUT=580
# If UM is restarted more than once within this interval (43200 s = 12 hours),
# the customer gets an alert email.
UM_CRITICAL_RESTART_TIMEINTERVAL=43200

# Whole seconds since OS boot: first field of /proc/uptime without the fraction.
UPSECS=$(cut -d. -f1 /proc/uptime)
if [ "$UPSECS" -lt "$UM_START_TIMEOUT" ]; then
    # The machine has just booted; do not run any test yet.
    logger "[UM Monitor] Skipping health check because of short OS uptime"
    exit 0
fi

# Skip the check when UM was restarted a short time ago.
# $LAST_RESTART_FN holds the epoch seconds written around the last restart
# performed by this script; touch makes sure the file exists before reading it.
touch "$LAST_RESTART_FN"
LAST_RESTART_TIME_INTERVAL=$(< "$LAST_RESTART_FN")
# Drop stray whitespace so the validation only looks at the value itself.
LAST_RESTART_TIME_INTERVAL=${LAST_RESTART_TIME_INTERVAL//[[:space:]]/}
if [[ "$LAST_RESTART_TIME_INTERVAL" =~ ^[0-9]+$ ]]; then
    # Turn the stored timestamp into "seconds since the last restart".
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    LAST_RESTART_TIME_INTERVAL=$(( $(date +%s) - 10#$LAST_RESTART_TIME_INTERVAL ))
else
    # No usable timestamp (new, empty or corrupted file): behave as if the
    # last restart happened long ago, so neither this gate nor the later
    # numeric comparisons on this variable fail on a non-numeric value.
    LAST_RESTART_TIME_INTERVAL='1000000'
fi
if [ "$LAST_RESTART_TIME_INTERVAL" -lt "$UM_START_TIMEOUT" ]; then
    logger "[UM Monitor] Skipping health check because of resent restart"
    exit 0
fi

# Maintain the run counter stored in $COUNTER_FN. It lets quick checks run on
# every invocation while slower ones run only on some of them.
touch "$COUNTER_FN"
RunCounter=$(< "$COUNTER_FN")
# An empty file counts as 0; then add one for the current run and persist it.
RunCounter=$(( ${RunCounter:-0} + 1 ))
printf '%s\n' "$RunCounter" > "$COUNTER_FN"

# -------------------------------------------------------------------
# -------------------------------------------------------------------
# Source umsetenv.sh from the directory this script lives in. Its contents are
# not visible here; presumably it provides appDir and webappClasses, which
# sendEmailToCustomer reads - TODO confirm.
# The path is quoted so an install directory containing whitespace still works.
source "$(dirname "$0")/umsetenv.sh"

# Failure flag. Set to 1 when a test fails and a restart is needed.
TEST_FAILED=0
# Error code of the failed test. It is sent to the customer by email, so use
# something that does not look scary.
ERROR_CODE=''

# Check UM status. This request fails if:
#  - tomcat is offline
#  - UM internal state is not good
if [ "$TEST_FAILED" -eq 0 ]; then
    # Timeout is 30 seconds because UM can be busy and we don't want to
    # restart it just because it answers slowly.
    restartUM=$("$CURL" --connect-timeout 30 --max-time 30 --silent -k "https://$UMIP:8443/um/api/health")
    # Join the response into one line and extract the text between the
    # <health_code> tags (empty when the tag is missing). The response is
    # quoted so characters such as ? and * in it are never glob-expanded.
    healthCode=$(printf '%s\n' "$restartUM" | paste -s -d ' ' - | sed -n -e 's/.*<health_code>\(.*\)<\/health_code>.*/\1/p')

    case "$healthCode" in
        "OK")
           logger "[UM Monitor] API um/api/health return OK status"
           ;;
        *)
           # UM did not confirm that it is healthy: request a restart.
           logger "[UM Monitor] API um/api/health failed or return wrong respond $restartUM"
           TEST_FAILED=1
           ERROR_CODE="$healthCode"
           # No code at all (empty or only whitespace) means the API did not
           # answer properly.
           if [[ -z "${ERROR_CODE//[[:space:]]/}" ]]; then
               ERROR_CODE="ERR_TMC" # tomcat not running error code
           fi
           ;;
    esac
fi

# Check the number of file descriptors opened by UM.
if [ "$TEST_FAILED" -eq 0 ]; then
    # PIDs of every process whose ps line mentions tomcat, catalina and start.
    um_pid=$(ps aux | grep tomcat | grep catalina | grep start | awk '{print $2}')
    # Count the entries of /proc/<pid>/fd for each PID separately and keep the
    # largest count. This stays correct when several processes match, and
    # yields 0 without running ls on a bogus path when none matches.
    um_fd_num=0
    for um_one_pid in $um_pid; do
        um_one_fd_num=$(( $(ls "/proc/$um_one_pid/fd" 2>/dev/null | wc -l) ))
        if [ "$um_one_fd_num" -gt "$um_fd_num" ]; then
            um_fd_num=$um_one_fd_num
        fi
    done
    logger "[UM Monitor] UM open file descriptors number is $um_fd_num"
    if [ "$um_fd_num" -gt 2000 ]; then
        TEST_FAILED=1
        ERROR_CODE="ERR_FDN" # too many file descriptor (Number)
    fi
fi

# Collection freshness test. It runs only on every 12th invocation
# (RunCounter % 12 == 11) and only once the counter is above 20; a restart
# done by this script resets the counter to 0, so the test is not run right
# after a restart.
# NOTE(review): the previous comment said "once in an hour and a half"; the
# real period depends on how often this script is scheduled - confirm.
if [ "$TEST_FAILED" -eq 0 ] && [ $(( $RunCounter % 12 )) -eq 11 ] && [ "$RunCounter" -gt 20 ]; then
    # Day component of (now() - start) for the most recently started collection.
    lastCollectionDays=$(/opt/vmware/vpostgres/current/bin/psql -t -d usgmtr -U usgmtr -c 'select extract(day from (select now() - start from "Collection" order by start desc limit 1))')
    logger "[UM Monitor] Testing when collection was done last time... Get value $lastCollectionDays days"

    # No value is an error (a finished collection is expected), so substitute
    # a number that is certainly above the threshold.
    # $lastCollectionDays is left unquoted on purpose in the two tests below:
    # word splitting strips any whitespace padding around the psql output.
    [ -z $lastCollectionDays ] && lastCollectionDays="1000"

    # React when the day component of the collection age is greater than 1.
    if [ $lastCollectionDays -gt 1 ]; then
        if [ "$LAST_RESTART_TIME_INTERVAL" -lt "$UM_CRITICAL_RESTART_TIMEINTERVAL" ]; then
            # UM was already restarted within the critical interval, so a
            # restart did not help: tell the customer instead of restarting
            # again, because the reason of the failure is unknown.
            logger "[UM Monitor] Sending email to customer about this problem"
            sendEmailToCustomer "EmailCustomerShouldRestartMessage"
        else
            # No recent restart: try a restart first.
            logger "[UM Monitor] Attempting to restart UM to fix the collection problem"
            TEST_FAILED=1
            ERROR_CODE="ERR_COL" # collection problems
        fi
    fi
fi

# Look at the migration lock once more right before a possible restart
# (presumably because a migration may have started while the checks ran).
# Request: https://bugzilla.eng.vmware.com/show_bug.cgi?id=1898100
if [[ -e "$LOCK_FILE" ]]; then
    logger "[UM Monitor] Skipping health check because of migration locking"
    exit 0
fi

# Restart tomcat and postgres when one of the checks reported a failure.
if [ "$TEST_FAILED" -eq 1 ]; then

    # The restart time is stored both before and after the restart because the
    # file works as a lock: the restart may take a while, and runs started in
    # the meantime skip their checks while the timestamp is fresh.
    date +%s > "$LAST_RESTART_FN"

    logger "[UM Monitor] Error code is $ERROR_CODE, restarting tomcat and postgres"
    # Order matters: stop tomcat, restart postgres, then start tomcat again.
    for um_service_step in "tomcat stop" "vpostgres restart" "tomcat start"; do
        /sbin/service $um_service_step
    done
    logger "[UM Monitor] Finish restarting tomcat and postgres"

    date +%s > "$LAST_RESTART_FN"
    # Start counting runs from zero again after a restart.
    printf '%s\n' "0" > "$COUNTER_FN"

    # LAST_RESTART_TIME_INTERVAL was computed at script start, i.e. it is the
    # time since the previous restart. Email the customer only when that is
    # shorter than UM_CRITICAL_RESTART_TIMEINTERVAL (restarts are repeating).
    if [ "$LAST_RESTART_TIME_INTERVAL" -lt "$UM_CRITICAL_RESTART_TIMEINTERVAL" ]; then
        logger "[UM Monitor] Sending email about reboot with error code $ERROR_CODE"
        sendEmailToCustomer "EmailRestartMessage" "$ERROR_CODE"
    else
        logger "[UM Monitor] Skipping sending email because last restart was $LAST_RESTART_TIME_INTERVAL seconds ago"
    fi
else
    logger "[UM Monitor] Finish checking with success"
fi

