Print system messages on an LCD display

At work, I manage a cluster of several hundred nodes. All those nodes are equipped with a little LCD display, like the one shown in the picture on the right. Until now, these displays were only displaying the name of the cluster vendor at boot, and the node name when the machine was up and running. To put these neat little displays to better use, I wrote a Python program named lcd.py, which prints more useful information.

When started, lcd.py burns into the NVRAM of the display the name of the company I am working for and the name of the node, which is in fact the position of the node in the rack. This helps me find which node is which when I take several of them out of the rack for maintenance. I just power the machine for a few seconds to read its name and position in the rack on the LCD display.

Then, the daemon goes into an infinite loop to probe the disk temperature using the S.M.A.R.T information provided by the disk. This information is then displayed on the second line of the LCD display. A quick look at the display and I know if something is wrong with the disk or the machine. Of course this is not the only monitoring system used to watch the system but every little detail helps. And it is fun to use these little displays.

The program also pings the “master node” to see if there are any network problems. If the “master node” cannot be reached, the program turns off the LCD display backlight. A quick glance at the entire rack and you know which node is in trouble.

For more information on these LCD displays, you can download the manual.

Download the entire source code

#!/usr/bin/env python
#
"""
Usage: lcd [options]

Options:
    -h, --help
        print this text.
    -f, --foreground
        run the progrand in foreground (for debugging purpose)

lcd is a deamon who print the health health status of the petabox on
the small lcd display.
 - The first line display the home name and mac address of the node.
 - The second line prints the disk temperature.
 - The display backlignt goes off it the network is down.

"""

import array
import commands
import os
import signal
import socket, fcntl
import sys
import termios
import time
import select
import threading, thread

from struct import pack, unpack, calcsize
from lib import mkdaemon

notOpen = "Serial port not open"
DISK_PROBE_TIME = 900
DEFAULT_TTY = '/dev/ttyS1'
PIDFILE = '/var/run/lcd.pid'
WORKDIR = '/'
UMASK = 0
MAXFD = 1024
SMART_DISKNOTFOUND = 0x02
DEFAULT_MSG = "    Twitter    " + "    @0x9900    "

ICMP_TYPE = 8
ICMP_CODE = 0
ICMP_CHECKSUM = 0
ICMP_ID = 0
ICMP_SEQ_NR = 0

HEARTBEAT_PACKET_SIZE = 56
HEARTBEAT_PROBE_TIME = 10
HEARTBEAT_SERVER = 'gateway.home'

class SerialError(Exception):
    """Error raised by the Serial class (open, configure or use failure)."""

class Serial(object):
    """
    Quick and dirty Serial class to communicate with the petabox small
    LCD display.

    The port is opened read/write and non-blocking, then switched to raw
    8-bit mode (no echo, no canonical processing, no software flow
    control) at 19200 baud.
    """
    def __init__(self, port):
        """
        Open and configure the serial device `port`.

        Raises SerialError when the device cannot be opened or configured;
        in both cases `self.fd` is left as None.
        """
        self.port = port
        try:
            self.fd = os.open(port, os.O_RDWR | os.O_NOCTTY | os.O_NONBLOCK)
        except Exception as msg:
            self.fd = None
            raise SerialError("could not open port %s: %s" % (port, msg))

        try:
            self._configPort()
        except Exception as msg:
            # Do not leak the descriptor when the configuration fails.
            os.close(self.fd)
            self.fd = None
            raise SerialError("could not configure port %s: %s" % (port, msg))

    def close(self):
        """Close the port. Calling it on a closed port is a no-op."""
        if self.fd is not None:
            os.close(self.fd)
            self.fd = None

    def read(self, size=1):
        """
        Read at most `size` bytes with a single os.read() call.

        Returns an empty string when `size` is not positive.  The port is
        opened with O_NONBLOCK, so os.read() may raise OSError instead of
        waiting for data.  Raises SerialError when the port is closed.
        """
        if self.fd is None:
            raise SerialError(notOpen)

        if size <= 0:
            return ''
        return os.read(self.fd, size)

    def write(self, data):
        """
        Output the given string over the serial port.

        os.write() may accept only part of the data, so keep writing until
        everything has been handed over.  Raises SerialError when the port
        is closed.
        """
        if self.fd is None:
            raise SerialError(notOpen)

        remaining = data
        while remaining:
            written = os.write(self.fd, remaining)
            remaining = remaining[written:]

    def flush(self):
        """
        Wait until all pending output has been transmitted (tcdrain).

        Raises SerialError when the port is closed.
        """
        if self.fd is None:
            # The original referenced the undefined name `notopen`, which
            # raised NameError instead of the intended SerialError.
            raise SerialError(notOpen)
        termios.tcdrain(self.fd)

    def _configPort(self):
        """
        Put the tty in raw 8-bit mode at 19200 baud.

        Raises SerialError when the current attributes cannot be read.
        """
        try:
            iflag, oflag, cflag, lflag, ispeed, ospeed, cc = \
                termios.tcgetattr(self.fd)
        except termios.error as msg:
            raise SerialError("Could not configure port: %s" % msg)

        # Ignore modem control lines, enable the receiver, 8 data bits.
        cflag |= (termios.CLOCAL | termios.CREAD | termios.CS8)

        # No canonical mode, no echo, no signal characters.
        lflag &= ~(termios.ICANON | termios.ECHO | termios.ECHOE |
                   termios.ECHOK | termios.ECHONL | termios.ISIG |
                   termios.IEXTEN | termios.ECHOCTL | termios.ECHOKE)

        # No output post-processing.
        oflag &= ~(termios.OPOST)

        # No CR/NL translation, no break handling, no XON/XOFF.
        iflag &= ~(termios.INLCR | termios.IGNCR | termios.ICRNL |
                   termios.IGNBRK | termios.IXON | termios.IXOFF |
                   termios.IXANY)

        ispeed = ospeed = termios.B19200
        termios.tcsetattr(self.fd, termios.TCSANOW,
                          [iflag, oflag, cflag, lflag, ispeed, ospeed, cc])


class LCD(Serial):
    """
    Driver for the small serial LCD display (2 lines of 16 characters).

    Every command is the byte 0xFE followed by a command code.
    """
    # The text codes can be found at
    # http://www.matrixorbital.ca/manuals/LK_series/LK162-12/LK162-12_200.pdf
    def __init__(self, msg=DEFAULT_MSG):
        """
        Initialise the serial connection and set the LCD display
        with the boot message.

        Only the first 32 characters of `msg` (2 lines x 16 columns) are
        kept.  Raises SerialError when DEFAULT_TTY cannot be opened.
        """
        super(LCD, self).__init__(DEFAULT_TTY)
        self.bootmsg = msg[:32]
        # 0xFE 0x40: presumably "store start-up screen" -- see the manual.
        self.write("%c%c%s" % (0xFE, 0x40, self.bootmsg))

        # Try to determine the degree character by checking the version:
        # 0xB2 unless the display answers '1', in which case 0xDF is used.
        self.DEG = 0xB2
        try:
            self.write("%c%c" % (0xFE, 0x36))
            time.sleep(1)   # give the display time to answer
            if self.read() == '1':
                self.DEG = 0xDF
        except Exception:
            # Best effort: the port is non-blocking, so a display that does
            # not answer makes read() fail; keep the default character.
            # KeyboardInterrupt/SystemExit are no longer swallowed here.
            pass

    def reset(self):
        """Send the boot message to the display as plain text."""
        super(LCD, self).write(self.bootmsg)

    def clear(self):
        """
        Clear the screen
        """
        self.write("%c%c" % (0xfe, 0x58))
        self.flush()

    def backlight_on(self):
        """
        turn on the display's backlight
        """
        # NOTE(review): the third byte looks like a timeout argument where
        # 0 means "stay on" -- confirm against the manual.
        self.write("%c%c%c" % (0xfe, 0x42, 0x0))

    def backlight_off(self):
        """
        turn off the display's backlight
        """
        self.write("%c%c" % (0xfe, 0x46))

    def write(self, msg):
        """Send `msg` unchanged to the display (see Serial.write)."""
        super(LCD, self).write(msg)

    def writeln(self, lineno, msg):
        """
        Write the text 'msg' to a given line. There are two lines [1-2].

        The text is left-justified and padded or truncated to exactly 16
        characters, so it always overwrites the whole line.
        """
        # 0xFE 0x47 <1> <lineno>: presumably "set cursor position" with
        # column 1 and row `lineno` -- see the manual.
        line = "%c%c%c%c%-16.16s" % (0xFE, 0x47, 1, lineno, msg[:16])
        self.write(line)


class SmartError(Exception):
    """Error raised by SmartInfo when the disk list is unavailable."""

class SmartInfo:
    """
    Get the disk smart information.

    `disks` maps a device name (as printed by `fdisk -l`, e.g. /dev/sda)
    to a (health, temperature) tuple of strings produced by _probedisk().
    """
    def __init__(self):
        """
        List the disks with `fdisk -l` and probe them once.

        When fdisk fails, `initialized` is set to False and every later
        call to update() raises SmartError.
        """
        self.disks = {}
        self.last_probe = 0
        self.smart = '/usr/sbin/smartctl'
        self.args = "-AHd ata"
        self.initialized = True

        # Get the disks names
        (status, output) = commands.getstatusoutput('fdisk -l')
        if status != 0:
            self.initialized = False
            return
        for line in output.split('\n'):
            if not line.startswith('Disk /dev'):
                continue
            fields = line.split()
            # fields[1] looks like "/dev/sda:"; drop the trailing colon.
            self.disks.setdefault(fields[1][:-1], '*')
        self.update()

    def update(self):
        """
        Update the smart information if DISK_PROBE_TIME is reached.

        Raises SmartError when the disk list could not be built.
        """
        if not self.initialized:
            raise SmartError('SMART: init err') # 16 char to fit the screen

        now = int(time.time())
        if self.last_probe + DISK_PROBE_TIME > now:
            return

        self.last_probe = now
        for disk in list(self.disks.keys()):
            self.disks[disk] = self._probedisk(disk)

    def _probedisk(self, disk):
        """
        Run smartctl on `disk` and return a (health, temperature) tuple.

        Returns ('Unknown', '*') when smartctl reports that the device
        could not be opened.
        """
        cmd = ' '.join((self.smart, self.args, disk))
        (status, output) = commands.getstatusoutput(cmd)
        # commands.getstatusoutput() returns a wait()-style status, so the
        # exit code must be extracted before testing smartctl's bit mask
        # (the original tested the raw status and never matched).
        if os.WIFEXITED(status) and \
                os.WEXITSTATUS(status) & SMART_DISKNOTFOUND:
            return ('Unknown', '*')
        return self._parse_smart_output(output)

    def _parse_smart_output(self, output):
        """
        Extract (health, temperature) from the text printed by smartctl.

        health is 'OK' or 'ERR' according to the overall-health line, or
        'NotFound' when that line is missing.  temperature is the last
        column of attribute 194, or '0' when it is missing.
        """
        health = "NotFound"
        temperature = "0"
        rows = [line.split() for line in output.split('\n')]

        for fields in rows:
            # Blank and one-word lines cannot be the health line.
            if len(fields) < 2:
                continue
            if fields[1].lower() == "overall-health":
                if fields[-1].upper() == 'PASSED':
                    health = 'OK'
                else:
                    health = 'ERR'
                break

        # get the disk temperature: attribute 194, tenth column
        for fields in rows:
            if len(fields) < 10:
                continue
            try:
                attr_id = int(fields[0])
            except ValueError:
                continue
            if attr_id == 194:
                temperature = fields[9]
                break

        return (health, temperature)


class OSInfo:
    """
    Get os information.

    Attributes:
        macaddr  -- MAC address of the interface, "XX:XX:XX:XX:XX:XX"
        hostname -- host name without its domain part
    """
    def __init__(self, iface='eth0'):
        """Collect the MAC address of `iface` and the short host name."""
        self.macaddr = self.get_macaddr(iface)
        self.hostname = self.get_hostname()

    def get_macaddr(self, iface):
        """
        Get the mac address for the interface iface.

        Returns "00:00:00:00:00:00" when the ioctl fails (for instance
        when the interface does not exist).
        """
        SIOCGIFHWADDR = 0x8927  # Value from /usr/include/linux/sockios.h
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            req = pack('16s16s', iface, '')
            try:
                info = fcntl.ioctl(s.fileno(), SIOCGIFHWADDR, req)
            except IOError:
                return "00:00:00:00:00:00"
        finally:
            # The socket is only a handle for the ioctl; always release it.
            s.close()
        # The hardware address is read from bytes 18..23 of the reply.
        return ':'.join(["%0.2X" % v for v in unpack('BBBBBB', info[18:24])])

    def get_hostname(self):
        """Return the host name up to (not including) the first dot."""
        return socket.gethostname().split('.')[0]


# This version of ping is the stripped down version of the ping.py by
# Lars Strand <lars strand at gnist org> we only need to see if the
# network is up and running.

def _construct(id):
    """
    Build an ICMP echo request packet.

    `id` is added to ICMP_SEQ_NR and stored in the 16-bit sequence number
    field.  The payload is HEARTBEAT_PACKET_SIZE bytes long: the current
    time packed as a double followed by 'X' padding.

    Returns the packet (header + payload) as a string, checksum included.
    """
    # The field is 16 bits wide: keep the low 16 bits and pack them
    # unsigned, so that ids above 32767 (e.g. a large pid) no longer make
    # pack() fail.  The bytes produced for smaller ids are unchanged.
    seq = (ICMP_SEQ_NR + id) & 0xFFFF

    # payload: timestamp, then padding up to the requested size
    size = HEARTBEAT_PACKET_SIZE - calcsize("d")
    data = pack("d", time.time()) + 'X' * size

    # The checksum is computed over the whole packet with the checksum
    # field set to ICMP_CHECKSUM, then the header is rebuilt with it.
    header = pack('bbHHH', ICMP_TYPE, ICMP_CODE, ICMP_CHECKSUM,
                  ICMP_ID, seq)
    checksum = _in_cksum(header + data)
    header = pack('bbHHH', ICMP_TYPE, ICMP_CODE, checksum, ICMP_ID, seq)

    return header + data

def _in_cksum(packet):
    """
    Return the 16-bit Internet checksum of `packet`.

    The packet is padded with a NUL byte when its length is odd, read as
    native 16-bit words, summed, folded back to 16 bits and complemented.
    Modelled on in_cksum from FreeBSD's ping.c.
    """
    # pad to a whole number of 16-bit words
    if len(packet) % 2:
        packet += '\0'

    # array 'h' yields signed shorts; mask each one back to unsigned
    total = 0
    for value in array.array('h', packet):
        total += value & 0xffff

    # fold the carries back into the low 16 bits
    total = (total >> 16) + (total & 0xffff)
    total += total >> 16

    return ~total & 0xffff


def pingNode(node, sock, timeout=1.0):
    """
    Send one ICMP echo request to `node` and wait for the matching reply.

    node    -- host name or address to ping
    sock    -- socket used to send the request and read the reply
               (Heartbeat passes a raw ICMP socket)
    timeout -- total number of seconds to wait for the reply

    Returns True when an echo reply carrying our sequence number arrives
    before the timeout, False otherwise (including when sending fails).
    """
    # The sequence number is read back as a signed 16-bit value, so keep
    # the identifier within 15 bits; a pid above 32767 would otherwise
    # never match (or fail to pack).
    ident = os.getpid() & 0x7FFF
    packet = _construct(ident)

    # send the ping
    try:
        sock.sendto(packet, (node, 1))
    except socket.error:
        return False

    # Packets unrelated to our request may show up on the socket, so keep
    # reading until the matching reply arrives or the time is up.
    deadline = time.time() + timeout
    while True:
        remaining = deadline - time.time()
        if remaining <= 0:
            return False

        readable, _, _ = select.select([sock], [], [], remaining)
        if not readable:
            return False    # timeout waiting for an answer

        pong, _address = sock.recvfrom(HEARTBEAT_PACKET_SIZE + 48)

        # The ICMP header is expected after a 20-byte IP header.
        if len(pong) < 28:
            continue
        pong_type, _code, _cksum, _ident, pong_seq = \
            unpack("bbHHh", pong[20:28])

        # type 0 is an ICMP echo reply
        if pong_type == 0 and pong_seq == ident:
            return True

class Heartbeat:
    """
    Check that a reference host answers to ping, at most once every
    `probe_time` seconds; in between, the last result is returned.
    """
    last_check = 0      # time (epoch seconds) of the last real ping
    status = 0          # result of the last ping

    def __init__(self, node=HEARTBEAT_SERVER, probe_time=HEARTBEAT_PROBE_TIME):
        """
        node       -- host to ping
        probe_time -- minimum number of seconds between two pings
        """
        self.node = node
        self.probe_time = probe_time

    def is_alive(self):
        """
        Return True when `node` answered the last ping, False otherwise.

        Name resolution failures, socket creation failures and network
        errors all count as "not alive" instead of raising.
        """
        now = int(time.time())
        if self.last_check + self.probe_time > now:
            return self.status

        self.last_check = now
        self.status = False
        try:
            host = socket.gethostbyname(self.node)
            sock = socket.socket(socket.AF_INET, socket.SOCK_RAW,
                                 socket.getprotobyname("icmp"))
        except socket.error:
            # Covers resolution errors too (gaierror derives from it).
            return self.status

        try:
            # Ping the address resolved above rather than resolving again.
            self.status = pingNode(host, sock)
        except (socket.error, select.error):
            self.status = False
        finally:
            sock.close()

        return self.status

def cleanup(signum, frame):
    """
    Signal handler: restore the boot message on the display, remove the
    pid file and exit with status 0.

    signum, frame -- standard signal handler arguments (unused)
    """
    try:
        lcd.reset()
    except Exception:
        # Best effort: a display error must not prevent the removal of
        # the pid file, otherwise the daemon refuses to start again.
        pass
    try:
        os.unlink(PIDFILE)
    except OSError:
        # The pid file may already be gone.
        pass
    sys.exit(0)


class LoadAvg(threading.Thread):
    """
    Background thread keeping a copy of the load average (third field of
    /proc/loadavg), refreshed every 300 seconds.
    """
    def __init__(self):
        threading.Thread.__init__(self)
        self.stop_event = threading.Event()
        self.load = 0.0
        # Protects self.load, shared between run() and get().
        self.lock = threading.Lock()

    def stop(self):
        """Ask the thread to terminate."""
        self.stop_event.set()

    def run(self):
        """Refresh the load average until stop() is called."""
        last_probe = 0
        while not self.stop_event.is_set():
            now = int(time.time())
            if last_probe + 300 < now:
                last_probe = now
                # Read the file outside the lock so that get() never has
                # to wait for the I/O.
                load = self._updateLoadAvg()
                with self.lock:
                    self.load = load
            # Wake up every 3 seconds, or at once when stop() is called.
            self.stop_event.wait(3)

    def get(self):
        """Return the last load average read (0.0 before the first read)."""
        with self.lock:
            return self.load

    def _updateLoadAvg(self):
        """
        Return the third field of /proc/loadavg as a float, or -1 when
        the file cannot be read or parsed.
        """
        filename = '/proc/loadavg'
        try:
            with open(filename) as loadavg:
                line = loadavg.readline()
            return float(line.split()[2])
        except (IOError, IndexError, ValueError):
            return -1

class RunSmartUpdate(threading.Thread):
    """
    Background thread calling `disks.update()` every 3 seconds until
    stop() is called.
    """
    def __init__(self, disks):
        threading.Thread.__init__(self)
        self.disks = disks
        self.stop_event = threading.Event()

    def stop(self):
        """Ask the thread to terminate."""
        self.stop_event.set()

    def run(self):
        """Keep refreshing the disk information until asked to stop."""
        while not self.stop_event.isSet():
            self.disks.update()
            time.sleep(3)

def main():
    """
    Entry point of the lcd daemon.

    Parses the command line, detaches from the terminal unless -f or
    --foreground is given, starts the disk and load average threads, then
    loops forever refreshing the display: host name on line 1; MAC
    address, then one status line per disk, on line 2.  The backlight
    follows the result of the heartbeat.
    """
    global lcd
    if '-h' in sys.argv or '--help' in sys.argv:
        sys.stdout.write("%s\n" % __doc__)
        sys.exit()

    if os.path.exists(PIDFILE):
        sys.stderr.write("%s exists. an lcd deamon must be running.\n"
                         % PIDFILE)
        sys.exit(1)

    # run in foreground (for debugging); --foreground is documented in the
    # usage text but was not honoured before.
    if '-f' not in sys.argv and '--foreground' not in sys.argv:
        mkdaemon.mkDaemon(PIDFILE)

    linux = OSInfo()

    disk = SmartInfo()
    disk_update_thread = RunSmartUpdate(disk)
    disk_update_thread.setDaemon(True)
    disk_update_thread.start()

    load_avg = LoadAvg()
    load_avg.setDaemon(True)
    load_avg.start()

    heartbeat = Heartbeat()

    # NOTE(review): DEFAULT_MSG is already 30 characters long and LCD keeps
    # only the first 32, so almost nothing of the host name ends up in the
    # boot message -- confirm the intended layout.
    if linux.hostname != "":
        lcd = LCD("%16s%16s" % (DEFAULT_MSG, linux.hostname[:16].center(16)))
    else:
        # `lcd` used to stay undefined in this case (NameError below).
        lcd = LCD()

    # clean up pid file and stop running threads
    signal.signal(signal.SIGINT,  cleanup)
    signal.signal(signal.SIGQUIT, cleanup)
    signal.signal(signal.SIGTERM, cleanup)

    try:
        lcd.writeln(1, linux.hostname[:16].rjust(16))
        while True:
            lcd.writeln(2, linux.macaddr[-16:])
            time.sleep(1)

            if heartbeat.is_alive():
                lcd.backlight_on()
            else:
                lcd.backlight_off()

            # One line per disk: name, temperature, health, load average.
            for k, v in sorted(disk.disks.items()):
                dname = k.split('/')[2]
                lcd.writeln(2, "%s:%s%c%3s %5.1f" % (dname, v[1], lcd.DEG,
                                                     v[0], load_avg.get()))
                time.sleep(2)
    finally:
        # Also reached when cleanup() calls sys.exit() from a signal.
        lcd.close()

if __name__ == "__main__":
    main()


Comments !