Answered (Assumed Answered)

Jobtracker Down Alarm

Question asked by tc_dev on Oct 6, 2014
Latest reply on Oct 8, 2014 by tc_dev
After upgrading to 4.0.1 (from 3.0.1), Jobtracker periodically fails and raises an alarm ("Can not determine if service: jobtracker is running. Check logs at: /opt/mapr/hadoop/hadoop-0.20.2/logs"). The alarm is not always visible, apparently because Warden restarts the service promptly.

What is most suspicious is that the WCS alarm references the Hadoop1 logs ( /opt/mapr/hadoop/hadoop-0.20.2/logs ) while the log path in the WCS service list references the Hadoop2 logs ( /opt/mapr/hadoop/hadoop-2.4.1/logs ). The cluster was supposedly switched to YARN after the upgrade; however, the Hadoop2 logs directory is empty...

Additionally, while the alarm is active, this message appears in WCS: "ERROR: Could not retrieve metric data"

EDIT: Here is my warden.conf

    # List of services handled by Warden. Entries are ';'-separated and each one
    # looks like name:count-or-"all"[:service it depends on] -- TODO confirm the
    # exact field meanings against the Warden documentation.
    services=webserver:all:cldb;jobtracker:1:cldb;tasktracker:all:jobtracker;hbmaster:all:cldb;hbregionserver:all:hbmaster;nfs:all:cldb;kvstore:all;cldb:all:kvstore;hoststats:all:kvstore
    # Per-service start commands.
    # NOTE(review): the jt/tt commands here (and in the stop and monitorcommand
    # groups below) all point at the hadoop-0.20.2 tree, the same tree named in
    # the "Check logs at" path of the JobTracker alarm.
    service.command.jt.start=/opt/mapr/hadoop/hadoop-0.20.2/bin/hadoop-daemon.sh start jobtracker
    service.command.tt.start=/opt/mapr/hadoop/hadoop-0.20.2/bin/hadoop-daemon.sh start tasktracker
    service.command.hbmaster.start=/opt/mapr/hbase/hbase-0.98.4/bin/hbase-daemon.sh start master
    service.command.hbregion.start=/opt/mapr/hbase/hbase-0.98.4/bin/hbase-daemon.sh start regionserver
    service.command.cldb.start=/etc/init.d/mapr-cldb start
    service.command.kvstore.start=/etc/init.d/mapr-mfs start
    service.command.mfs.start=/etc/init.d/mapr-mfs start
    service.command.nfs.start=/etc/init.d/mapr-nfsserver start -p 2059
    service.command.hoststats.start=/etc/init.d/mapr-hoststats start
    service.command.webserver.start=/opt/mapr/adminuiapp/webserver start
    # Per-service stop commands (same scripts as the start group, with "stop").
    service.command.jt.stop=/opt/mapr/hadoop/hadoop-0.20.2/bin/hadoop-daemon.sh stop jobtracker
    service.command.tt.stop=/opt/mapr/hadoop/hadoop-0.20.2/bin/hadoop-daemon.sh stop tasktracker
    service.command.hbmaster.stop=/opt/mapr/hbase/hbase-0.98.4/bin/hbase-daemon.sh stop master
    service.command.hbregion.stop=/opt/mapr/hbase/hbase-0.98.4/bin/hbase-daemon.sh stop regionserver
    service.command.cldb.stop=/etc/init.d/mapr-cldb stop
    service.command.kvstore.stop=/etc/init.d/mapr-mfs stop
    service.command.mfs.stop=/etc/init.d/mapr-mfs stop
    service.command.nfs.stop=/etc/init.d/mapr-nfsserver stop
    service.command.hoststats.stop=/etc/init.d/mapr-hoststats stop
    service.command.webserver.stop=/opt/mapr/adminuiapp/webserver stop
    # Launch type for each service; every service listed here uses BACKGROUND.
    service.command.jt.type=BACKGROUND
    service.command.tt.type=BACKGROUND
    service.command.hbmaster.type=BACKGROUND
    service.command.hbregion.type=BACKGROUND
    service.command.cldb.type=BACKGROUND
    service.command.kvstore.type=BACKGROUND
    service.command.mfs.type=BACKGROUND
    service.command.nfs.type=BACKGROUND
    service.command.hoststats.type=BACKGROUND
    service.command.webserver.type=BACKGROUND
    # Monitor strings: Java class names or binary paths, presumably matched
    # against running processes -- TODO confirm.
    # Note that hoststats and webserver have no .monitor entry in this file.
    service.command.jt.monitor=org.apache.hadoop.mapred.JobTracker
    service.command.tt.monitor=org.apache.hadoop.mapred.TaskTracker
    service.command.hbmaster.monitor=org.apache.hadoop.hbase.master.HMaster start
    service.command.hbregion.monitor=org.apache.hadoop.hbase.regionserver.HRegionServer start
    service.command.cldb.monitor=com.mapr.fs.cldb.CLDB
    service.command.kvstore.monitor=server/mfs
    service.command.mfs.monitor=server/mfs
    service.command.nfs.monitor=server/nfsserver
    # Per-service status commands (same scripts as above, with "status").
    service.command.jt.monitorcommand=/opt/mapr/hadoop/hadoop-0.20.2/bin/hadoop-daemon.sh status jobtracker
    service.command.tt.monitorcommand=/opt/mapr/hadoop/hadoop-0.20.2/bin/hadoop-daemon.sh status tasktracker
    service.command.hbmaster.monitorcommand=/opt/mapr/hbase/hbase-0.98.4/bin/hbase-daemon.sh status master
    service.command.hbregion.monitorcommand=/opt/mapr/hbase/hbase-0.98.4/bin/hbase-daemon.sh status regionserver
    service.command.cldb.monitorcommand=/etc/init.d/mapr-cldb status
    service.command.kvstore.monitorcommand=/etc/init.d/mapr-mfs status
    service.command.mfs.monitorcommand=/etc/init.d/mapr-mfs status
    service.command.nfs.monitorcommand=/etc/init.d/mapr-nfsserver status
    service.command.hoststats.monitorcommand=/etc/init.d/mapr-hoststats status
    service.command.webserver.monitorcommand=/opt/mapr/adminuiapp/webserver status
    # Heap-size settings per service: a percent value plus min/max bounds.
    # NOTE(review): units of min/max are not stated in this file (presumably MB)
    # -- TODO confirm.
    # Memory allocation for JobTracker is only used
    # to calculate total memory required for all services to run
    # but -Xmx JobTracker itself is not set allowing memory
    # on JobTracker to grow as needed
    # if upper limit on memory is strongly desired
    # set HADOOP_HEAPSIZE env. variable in /opt/mapr/hadoop/hadoop-0.20.2/conf/hadoop-env.sh
    service.command.jt.heapsize.percent=10
    service.command.jt.heapsize.max=5000
    service.command.jt.heapsize.min=256
    # Memory allocation for TaskTracker is only used
    # to calculate total memory required for all services to run
    # but -Xmx TaskTracker itself is not set allowing memory
    # on TaskTracker to grow as needed
    # if upper limit on memory is strongly desired
    # set HADOOP_HEAPSIZE env. variable in /opt/mapr/hadoop/hadoop-0.20.2/conf/hadoop-env.sh
    service.command.tt.heapsize.percent=2
    #service.command.tt.heapsize.percent=20
    #service.command.tt.heapsize.percent=26
    #service.command.tt.heapsize.percent=15
    service.command.tt.heapsize.max=3250
    service.command.tt.heapsize.min=64
    service.command.hbmaster.heapsize.percent=4
    service.command.hbmaster.heapsize.max=512
    service.command.hbmaster.heapsize.min=64
    service.command.hbregion.heapsize.percent=25
    #service.command.hbregion.heapsize.percent=15
    service.command.hbregion.heapsize.max=4000
    service.command.hbregion.heapsize.min=500
    #service.command.cldb.heapsize.percent=8
    service.command.cldb.heapsize.percent=3
    service.command.cldb.heapsize.max=4000
    service.command.cldb.heapsize.min=256
    # NOTE(review): service.command.mfs.heapsize.percent is assigned three times
    # below (20, 15, 10) with none commented out, unlike the tt/hbregion/cldb
    # alternates above. Which value takes effect depends on how the file is
    # parsed (a last-wins loader would use 10) -- confirm and comment out the
    # unused lines.
    service.command.mfs.heapsize.percent=20
    service.command.mfs.heapsize.percent=15
    service.command.mfs.heapsize.percent=10
    # mfs is bounded by maxpercent rather than an absolute max like the others.
    service.command.mfs.heapsize.maxpercent=85
    service.command.mfs.heapsize.min=512
    service.command.webserver.heapsize.percent=3
    service.command.webserver.heapsize.max=750
    service.command.webserver.heapsize.min=256
    service.command.nfs.heapsize.percent=3
    service.command.nfs.heapsize.min=64
    service.command.nfs.heapsize.max=1000
    service.command.os.heapsize.percent=10
    #service.command.os.heapsize.percent=5
    service.command.os.heapsize.max=4000
    service.command.os.heapsize.min=256
    service.command.warden.heapsize.percent=1
    service.command.warden.heapsize.max=750
    service.command.warden.heapsize.min=64
    service.command.zk.heapsize.percent=1
    service.command.zk.heapsize.max=1500
    service.command.zk.heapsize.min=256
    # Global Warden settings: process niceness, ZooKeeper endpoint, retry policy,
    # service ports and miscellaneous flags.
    service.nice.value=-10
    zookeeper.servers=localhost:5181
    nodes.mincount=1
    services.retries=3
    # Service ports. Note that mfs.port, hoststats.port and kvstore.port are all
    # set to 5660 in this file.
    cldb.port=7222
    mfs.port=5660
    hbmaster.port=60000
    hoststats.port=5660
    jt.port=9001
    jt.http.port=50030
    kvstore.port=5660
    mapr.home.dir=/opt/mapr
    centralconfig.enabled=true
    pollcentralconfig.interval.seconds=300
    rpc.drop=false
    hs.rpcon=true
    hs.port=1111
    hs.host=localhost
    # Retry intervals in seconds; the cldb-specific value (600) sits alongside
    # the general services value (1800).
    service.command.cldb.retryinterval.time.sec=600
    services.retryinterval.time.sec=1800
    # JobTracker response timeout, in minutes.
    jt.response.timeout.minutes=10
    isM7=0
    enable.overcommit=true

Outcomes