Answered (Assumed Answered)

CLDB Failure Impacting Control System and maprcli

Question asked by gunnar_tapper on Jul 19, 2013
Latest reply on Jul 23, 2013 by nabeel
I have a small three-node test system with CLDB running on nodes 1 and 3; the Control System webserver and the metrics database run on node 2:

    maprcli dashboard info -json
    {
            "timestamp":1374273183924,
            "status":"OK",
            "total":1,
            "data":[
                    {
                            "version":"2.1.1.18569.GA",
                            "cluster":{
                                    "name":"my.cluster.com",
                                    "ip":"x.x.x.x",
                                    "id":"6890087335535363390"
                            },
                            "volumes":{
                                    "mounted":{
                                            "total":16,
                                            "size":1096
                                    },
                                    "unmounted":{
                                            "total":1,
                                            "size":0
                                    }
                            },
                            "utilization":{
                                    "cpu":{
                                            "util":11,
                                            "total":12,
                                            "active":1
                                    },
                                    "memory":{
                                            "total":47853,
                                            "active":13708
                                    },
                                    "disk_space":{
                                            "total":41,
                                            "active":4
                                    },
                                    "compression":{
                                            "compressed":1,
                                            "uncompressed":7
                                    }
                            },
                            "services":{
                                    "fileserver":{
                                            "active":3,
                                            "stopped":0,
                                            "failed":0,
                                            "total":3
                                    },
                                    "hbregionserver":{
                                            "active":3,
                                            "stopped":0,
                                            "failed":0,
                                            "total":3
                                    },
                                    "nfs":{
                                            "active":0,
                                            "stopped":0,
                                            "failed":1,
                                            "total":1
                                    },
                                    "webserver":{
                                            "active":1,
                                            "stopped":0,
                                            "failed":0,
                                            "total":1
                                    },
                                    "cldb":{
                                            "active":2,
                                            "stopped":0,
                                            "failed":0,
                                            "total":2
                                    },
                                    "tasktracker":{
                                            "active":3,
                                            "stopped":0,
                                            "failed":0,
                                            "total":3
                                    },
                                    "jobtracker":{
                                            "active":1,
                                            "standby":0,
                                            "stopped":0,
                                            "failed":0,
                                            "total":1
                                    },
                                    "hoststats":{
                                            "active":3,
                                            "stopped":0,
                                            "failed":0,
                                            "total":3
                                    },
                                    "hbmaster":{
                                            "active":1,
                                            "stopped":0,
                                            "failed":0,
                                            "total":1
                                    }
                            },
                            "mapreduce":{
                                    "running_jobs":0,
                                    "queued_jobs":0,
                                    "running_tasks":0,
                                    "running_map_tasks":0,
                                    "running_reduce_tasks":0,
                                    "map_task_capacity":3,
                                    "reduce_task_capacity":3,
                                    "map_task_prefetch_capacity":0,
                                    "blacklisted":0
                            }
                    }
            ]
    }

Thus, I believe I have set up CLDB for HA. However, when I fail node 1, I can no longer use the MapR Control System:

    2:24:42 PM - Couldn't connect to the CLDB service
    2:24:42 PM - Couldn't connect to the CLDB service. Check if at least one CLDB is running.

maprcli complains with a similar error.

Do I need to do something beyond configuring CLDB to run on two nodes to ensure that the Control System and maprcli remain usable when a node fails?

Outcomes