At work, I manage a cluster of several hundred nodes. All those nodes are equipped with a little LCD display, like the one shown in the picture on the right. Until now, these displays only showed the name of the cluster vendor at boot, and the node name when the machine was up and running. To put these neat little displays to better use, I wrote a Python program named lcd.py, which prints more useful information.
When started, lcd.py burns into the NVRAM of the display the name of the company I am working for and the name of the node, which is in fact the position of the node in the rack. This helps me find which node is which when I take several of them out of the rack for maintenance. I just power the machine for a few seconds to read its name and position in the rack on the LCD display.
Then, the daemon goes into an infinite loop to probe the disk temperature using the S.M.A.R.T information provided by the disk. This information is then displayed on the second line of the LCD display. A quick look at the display and I know if something is wrong with the disk or the machine. Of course this is not the only monitoring system used to watch the system but every little detail helps. And it is fun to use these little displays.
The program also pings the “master node” to see if there are any network problems. If the “master node” cannot be reached, the program turns off the LCD display backlight. A quick glance at the entire rack and you know which node is in trouble.
For more information on these LCD displays, you can download the manual.
Download the entire source code
#!/usr/bin/env python
#
"""
Usage: lcd [options]
Options:
-h, --help
print this text.
-f, --foreground
run the program in the foreground (for debugging purposes)
lcd is a daemon that prints the health status of the petabox on
the small lcd display.
- The first line displays the host name and mac address of the node.
- The second line prints the disk temperature.
- The display backlight goes off if the network is down.
"""
import array
import commands
import os
import signal
import socket, fcntl
import sys
import termios
import time
import select
import threading, thread
from struct import pack, unpack, calcsize
from lib import mkdaemon
# Error message used when an operation is attempted on a closed serial port.
notOpen = "Serial port not open"

# Minimum number of seconds between two S.M.A.R.T. probes of the disks
# (compared against time.time() in SmartInfo.update).
DISK_PROBE_TIME = 900

# Serial device the LCD class opens.
DEFAULT_TTY = '/dev/ttyS1'

# PID file; main() refuses to start while this file exists and cleanup()
# removes it on exit.
PIDFILE = '/var/run/lcd.pid'

# NOTE(review): WORKDIR, UMASK and MAXFD are not referenced anywhere in this
# file; presumably daemonization settings -- confirm before removing.
WORKDIR = '/'
UMASK = 0
MAXFD = 1024

# Bit tested against the smartctl status in SmartInfo._probedisk to detect a
# disk that could not be probed.
SMART_DISKNOTFOUND = 0x02

# Default boot message handed to the LCD when no other message is given.
DEFAULT_MSG = " Twitter " + " @0x9900 "

# Header fields of the ICMP packet built by _construct().
ICMP_TYPE = 8
ICMP_CODE = 0
ICMP_CHECKSUM = 0  # placeholder used while the real checksum is computed
ICMP_ID = 0
ICMP_SEQ_NR = 0    # base value; the caller-supplied id is added to it

# Heartbeat (ping) settings.
HEARTBEAT_PACKET_SIZE = 56           # size in bytes of the ICMP payload
HEARTBEAT_PROBE_TIME = 10            # minimum seconds between two pings
HEARTBEAT_SERVER = 'gateway.home'    # default node pinged by Heartbeat
class SerialError(Exception):
    """Raised when the serial port cannot be opened, configured or used."""
class Serial(object):
    """
    Quick and dirty serial port wrapper used to communicate with the
    petabox small LCD display.

    The device is opened read/write, non-blocking and without becoming the
    controlling terminal, then switched to raw 8-bit mode at 19200 baud.
    ``fd`` holds the OS file descriptor, or None when the port is closed.
    """

    def __init__(self, port):
        """
        Open and configure the serial device ``port``.

        Raises SerialError if the device cannot be opened or configured.
        """
        self.port = port
        # Set first so the attribute exists even when os.open() fails.
        self.fd = None
        try:
            self.fd = os.open(port, os.O_RDWR | os.O_NOCTTY | os.O_NONBLOCK)
        except Exception as msg:
            raise SerialError("could not open port %s: %s" % (port, msg))
        try:
            self._configPort()
        except Exception as msg:
            # Do not leak the descriptor when the configuration fails.
            os.close(self.fd)
            self.fd = None
            raise SerialError("could not configur port %s: %s" % (port, msg))

    def close(self):
        """Close the port; calling it on a closed port is a no-op."""
        if self.fd is not None:
            os.close(self.fd)
            self.fd = None

    def read(self, size=1):
        """
        Read at most ``size`` bytes with a single os.read() call.

        Returns an empty string when ``size`` is not positive.
        Raises SerialError if the port is not open.
        """
        if self.fd is None:
            raise SerialError(notOpen)
        if size > 0:
            return os.read(self.fd, size)
        return ''

    def write(self, data):
        """
        Output the given string over the serial port.

        os.write() may accept only part of the data, so keep writing the
        remainder until everything has been sent.
        Raises SerialError if the port is not open.
        """
        if self.fd is None:
            raise SerialError(notOpen)
        remaining = data
        while len(remaining) > 0:
            written = os.write(self.fd, remaining)
            remaining = remaining[written:]

    def flush(self):
        """
        Wait until all the output written to the port has been transmitted.

        Raises SerialError if the port is not open.
        """
        if self.fd is None:
            # The original referenced the undefined name ``notopen`` here,
            # which raised NameError instead of SerialError.
            raise SerialError(notOpen)
        termios.tcdrain(self.fd)

    def _configPort(self):
        """
        Put the port in raw mode (no echo, no signals, no output
        post-processing, no CR/NL translation, no software flow control),
        8 data bits, at 19200 baud.

        Raises SerialError if the current attributes cannot be read.
        """
        try:
            iflag, oflag, cflag, lflag, ispeed, ospeed, cc = \
                termios.tcgetattr(self.fd)
        except termios.error as msg:
            raise SerialError("Could not configure port: %s" % msg)
        # set up raw mode / no echo / binary
        cflag |= (termios.CLOCAL | termios.CREAD | termios.CS8)
        lflag &= ~(termios.ICANON | termios.ECHO | termios.ECHOE |
                   termios.ECHOK | termios.ECHONL | termios.ISIG |
                   termios.IEXTEN | termios.ECHOCTL | termios.ECHOKE)
        oflag &= ~(termios.OPOST)
        iflag &= ~(termios.INLCR | termios.IGNCR | termios.ICRNL |
                   termios.IGNBRK | termios.IXON | termios.IXOFF |
                   termios.IXANY)
        ispeed = ospeed = termios.B19200
        termios.tcsetattr(self.fd, termios.TCSANOW,
                          [iflag, oflag, cflag, lflag, ispeed, ospeed, cc])
class LCD(Serial):
    """
    Driver for the small serial LCD display (two lines of 16 characters).

    Every command is the prefix byte 0xFE followed by a command byte.
    """
    # The text codes can be found at
    # http://www.matrixorbital.ca/manuals/LK_series/LK162-12/LK162-12_200.pdf

    def __init__(self, msg=DEFAULT_MSG):
        """
        Initialise the serial connection and set the LCD display
        with the boot message.

        ``msg`` is truncated to 32 characters (2 lines x 16 columns).
        Raises SerialError if the port cannot be opened or configured.
        """
        super(LCD, self).__init__(DEFAULT_TTY)
        self.bootmsg = msg[:32]
        self.write("%c%c%s" % (0xFE, 0x40, self.bootmsg))
        # Try to determine the degree character by checking the version.
        self.DEG = 0xB2
        try:
            self.write("%c%c" % (0xFE, 0x36))
            time.sleep(1)  # give the display time to answer
            if self.read() == '1':
                self.DEG = 0xDF
        except Exception:
            # Best effort: when the probe fails, keep the default degree
            # character. ``Exception`` (instead of a bare except) lets
            # KeyboardInterrupt and SystemExit through.
            pass

    def reset(self):
        """Write the boot message back to the display."""
        super(LCD, self).write(self.bootmsg)

    def clear(self):
        """
        Clear the screen
        """
        self.write("%c%c" % (0xfe, 0x58))
        self.flush()

    def backlight_on(self):
        """
        turn on the display's backlight
        """
        self.write("%c%c%c" % (0xfe, 0x42, 0x0))

    def backlight_off(self):
        """
        turn off the display's backlight
        """
        self.write("%c%c" % (0xfe, 0x46))

    def write(self, msg):
        """Send ``msg`` unchanged to the display."""
        super(LCD, self).write(msg)

    def writeln(self, lineno, msg):
        """
        Write the text 'msg' to a given line. There are two lines [1-2].

        The text is truncated or padded with spaces to exactly 16
        characters, so the previous content of the line is overwritten.
        """
        # 0xFE 0x47 <1> <lineno>: presumably "set cursor position"
        # (column 1, row lineno) -- confirm against the manual above.
        line = "%c%c%c%c%-16.16s" % (0xFE, 0x47, 1, lineno, msg[:16])
        self.write(line)
class SmartError(Exception):
    """Error raised when the SMART information cannot be collected."""
class SmartInfo:
    """
    Get the disk smart information.

    ``disks`` maps a device path (e.g. '/dev/sda') to a
    ``(health, temperature)`` tuple of strings refreshed by update().
    """

    def __init__(self):
        """
        List the disks with ``fdisk -l`` and run a first probe.

        When fdisk fails, ``initialized`` is set to False and every later
        call to update() raises SmartError.
        """
        self.disks = {}
        self.last_probe = 0
        self.smart = '/usr/sbin/smartctl'
        self.args = "-AHd ata"
        self.initialized = True
        # Get the disks names
        (status, output) = commands.getstatusoutput('fdisk -l')
        if status != 0:
            self.initialized = False
            return
        for line in output.split('\n'):
            if not line.startswith('Disk /dev'):
                continue
            fields = line.split()
            # Only "Disk /dev/sda: 500.1 GB, ..." lines name a disk; lines
            # such as "Disk /dev/sda doesn't contain a valid partition
            # table" have no trailing colon and must be skipped, otherwise
            # a truncated bogus name ('/dev/sd') gets registered.
            if len(fields) < 2 or not fields[1].endswith(':'):
                continue
            self.disks.setdefault(fields[1][:-1], ('Unknown', '*'))
        self.update()

    def update(self):
        """
        Update the smart information if DISK_PROBE_TIME is reached.

        Raises SmartError when the disk list could not be initialized.
        """
        if not self.initialized:
            raise SmartError('SMART: init err')  # 16 char to fit the screen
        now = int(time.time())
        if self.last_probe + DISK_PROBE_TIME > now:
            return
        self.last_probe = now
        for disk in list(self.disks):
            self.disks[disk] = self._probedisk(disk)

    def _probedisk(self, disk):
        """
        Run smartctl on ``disk`` and return ``(health, temperature)``.

        Returns ('Unknown', '*') when smartctl did not exit normally or
        reports (exit status bit SMART_DISKNOTFOUND) that the device could
        not be opened.
        """
        cmd = ' '.join((self.smart, self.args, disk))
        (status, output) = commands.getstatusoutput(cmd)
        # getstatusoutput() returns a wait()-encoded status: the exit code
        # lives in the high byte, so it must be decoded before its bits
        # are tested.
        if not os.WIFEXITED(status):
            return ('Unknown', '*')
        if os.WEXITSTATUS(status) & SMART_DISKNOTFOUND:
            return ('Unknown', '*')
        return self._parse_report(output)

    def _parse_report(self, output):
        """
        Extract ``(health, temperature)`` from the text printed by smartctl.

        health is 'OK' or 'ERR' according to the overall-health line, or
        'NotFound' when that line is missing. temperature is the 10th
        column of the attribute 194 row, or '0' when it is missing.
        """
        health = "NotFound"
        temperature = "0"
        rows = [line.split() for line in output.split('\n')]
        for fields in rows:
            if len(fields) > 1 and fields[1].lower() == "overall-health":
                if fields[-1].upper() == 'PASSED':
                    health = 'OK'
                else:
                    health = 'ERR'
                break
        # get the disk temperature
        for fields in rows:
            if len(fields) < 10:
                continue
            try:
                attr_id = int(fields[0])
            except ValueError:
                continue
            if attr_id == 194:
                temperature = fields[9]
                break
        return (health, temperature)
class OSInfo:
    """
    Get os information.

    On construction, stores the MAC address of 'eth0' in ``macaddr`` and
    the short host name in ``hostname``.
    """

    def __init__(self):
        self.macaddr = self.get_macaddr('eth0')
        self.hostname = self.get_hostname()

    def get_macaddr(self, iface):
        """
        Get the mac address for the interface iface.

        Returns the address as six colon separated upper case hex bytes,
        or "00:00:00:00:00:00" when the ioctl fails.
        """
        SIOCGIFHWADDR = 0x8927  # Value from /usr/include/linux/sockios.h
        req = pack('16s16s', iface, '')
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            info = fcntl.ioctl(s.fileno(), SIOCGIFHWADDR, req)
        except IOError:
            return "00:00:00:00:00:00"
        finally:
            # The socket is only a handle for the ioctl; always release it.
            s.close()
        # The six address bytes are read from offsets 18..23 of the reply.
        return ':'.join(["%0.2X" % v for v in unpack('BBBBBB', info[18:24])])

    def get_hostname(self):
        """Return the host name up to (not including) the first dot."""
        # split() always returns at least one element, so no error
        # handling is needed here.
        return socket.gethostname().split('.')[0]
# This version of ping is a stripped-down version of ping.py by
# Lars Strand <lars strand at gnist org>. We only need to see if the
# network is up and running.
def _construct(id):
    """
    Build an ICMP echo packet whose sequence number is ICMP_SEQ_NR + id.

    The payload is the current time packed as a double, padded with 'X'
    up to HEARTBEAT_PACKET_SIZE bytes. The sequence number is packed as a
    signed 16-bit value ('h'), so it has to fit in that range.
    """
    seq_nr = ICMP_SEQ_NR + id

    def build_header(cksum):
        # type, code, checksum, identifier, sequence number
        return pack('bbHHh', ICMP_TYPE, ICMP_CODE, cksum, ICMP_ID, seq_nr)

    # payload: timestamp followed by the padding
    padding = HEARTBEAT_PACKET_SIZE - calcsize("d")
    payload = pack("d", time.time()) + 'X' * padding

    # The checksum is computed over the packet carrying the placeholder
    # checksum, then the header is rebuilt with the real value.
    checksum = _in_cksum(build_header(ICMP_CHECKSUM) + payload)
    return build_header(checksum) + payload
def _in_cksum(packet):
"""THE RFC792 states: 'The 16 bit one's complement of
the one's complement sum of all 16 bit words in the header.'
Generates a checksum of a (ICMP) packet. Based on in_chksum found
in ping.c on FreeBSD.
"""
# add byte if not dividable by 2
if len(packet) & 1:
packet = packet + '\0'
# split into 16-bit word and insert into a binary array
words = array.array('h', packet)
sum = 0
# perform ones complement arithmetic on 16-bit words
for word in words:
sum += (word & 0xffff)
hi = sum >> 16
lo = sum & 0xffff
sum = hi + lo
sum = sum + (sum >> 16)
return (~sum) & 0xffff # return ones complement
def pingNode(node, sock, timeout=1.0):
    """
    Ping ``node`` once through the raw ICMP socket ``sock``.

    Returns True when an echo reply carrying our sequence number arrives
    within ``timeout`` seconds, False otherwise (send error, timeout, or
    only unrelated packets received).
    """
    ICMP_ECHO_REPLY = 0

    # The sequence number is packed as a signed 16-bit value, so the pid
    # is masked to fit: pids above 32767 would otherwise fail to pack or
    # never match the reply.
    ident = os.getpid() & 0x7fff
    packet = _construct(ident)  # make a ping packet

    # send the ping
    try:
        sock.sendto(packet, (node, 1))
    except socket.error:
        return False

    # Keep reading until our reply shows up or the timeout expires:
    # unrelated ICMP traffic must not be mistaken for a missing answer.
    deadline = time.time() + timeout
    while True:
        remaining = deadline - time.time()
        if remaining <= 0:
            return False
        readable, _, _ = select.select([sock], [], [], remaining)
        if not readable:
            return False  # no data and timeout occurred
        # read data (we only need the header)
        pong, _address = sock.recvfrom(HEARTBEAT_PACKET_SIZE + 48)
        if len(pong) < 28:
            continue  # too short to hold the ICMP header
        # The ICMP header is read from bytes 20..27, which assumes a
        # 20-byte IP header in front of it.
        pong_type, _code, _cksum, _id, pong_seq = unpack("bbHHh",
                                                         pong[20:28])
        if pong_type == ICMP_ECHO_REPLY and pong_seq == ident:
            return True
class Heartbeat:
    """
    Check at most once every ``probe_time`` seconds that ``node``
    answers a ping; between two checks the last result is returned.
    """
    last_check = 0   # time of the last probe (seconds since the epoch)
    status = False   # result of the last probe

    def __init__(self, node=HEARTBEAT_SERVER, probe_time=HEARTBEAT_PROBE_TIME):
        self.node = node
        self.probe_time = probe_time

    def is_alive(self):
        """
        Return True if the node answered the last ping, False otherwise.

        A node whose name cannot be resolved is reported as not alive.
        """
        now = int(time.time())
        if self.last_check + self.probe_time > now:
            return self.status
        self.last_check = now
        self.status = False
        try:
            host = socket.gethostbyname(self.node)
        except socket.gaierror:
            return self.status
        sock = socket.socket(socket.AF_INET, socket.SOCK_RAW,
                             socket.getprotobyname("icmp"))
        try:
            # Ping the address resolved above so the name is not looked
            # up a second time when the packet is sent.
            self.status = pingNode(host, sock)
        finally:
            # Release the raw socket even when the ping raises.
            sock.close()
        return self.status
def cleanup(signum, frame):
    """
    Signal handler: restore the boot message on the display, remove the
    PID file and exit with status 0.

    Uses the module level ``lcd`` set by main().
    """
    try:
        lcd.reset()
    finally:
        # Remove the PID file even when the display cannot be reset:
        # a stale file would prevent the daemon from starting again.
        try:
            os.unlink(PIDFILE)
        except OSError:
            pass  # no PID file to remove (e.g. foreground mode)
    sys.exit(0)
class LoadAvg(threading.Thread):
    """
    Thread refreshing every 300 seconds the load average read from
    /proc/loadavg. The last value is available through get().
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.stop_event = threading.Event()
        self.load = 0.0
        # Protects ``load``, written by run() and read by get().
        self.lock = threading.Lock()

    def stop(self):
        """Ask the thread to stop; it checks the request every 3 seconds."""
        self.stop_event.set()

    def run(self):
        """Refresh the load average until stop() is called."""
        last_probe = 0
        while not self.stop_event.is_set():
            now = int(time.time())
            if last_probe + 300 < now:
                last_probe = now
                # Read the file outside the lock, publish the value inside.
                load = self._updateLoadAvg()
                with self.lock:
                    self.load = load
            time.sleep(3)

    def get(self):
        """Return the last load average read (0.0 before the first read)."""
        with self.lock:
            return self.load

    def _updateLoadAvg(self):
        """
        Return the third field of /proc/loadavg as a float, or -1 when
        the file cannot be read or parsed.
        """
        filename = '/proc/loadavg'
        try:
            with open(filename) as loadavg:
                line = loadavg.readline()
        except IOError:
            return -1
        try:
            return float(line.split()[2])
        except (IndexError, ValueError):
            return -1
class RunSmartUpdate(threading.Thread):
    """
    Thread calling ``disks.update()`` every 3 seconds until stop() is
    called. ``disks`` is any object providing an update() method.
    """

    def __init__(self, disks):
        threading.Thread.__init__(self)
        self.disks = disks
        self.stop_event = threading.Event()

    def stop(self):
        """Ask the thread to leave its loop."""
        self.stop_event.set()

    def run(self):
        """Refresh the disks information until a stop is requested."""
        while not self.stop_event.isSet():
            self.disks.update()
            time.sleep(3)
def main():
    """
    Entry point of the lcd daemon.

    Handles the command line options, daemonizes unless asked to stay in
    the foreground, starts the disk and load average threads, then loops
    forever refreshing the display:
      - line 1 shows the host name;
      - line 2 alternates between the MAC address and, for each disk,
        its name, temperature, health and the load average;
      - the backlight follows the result of the heartbeat.
    """
    global lcd

    args = sys.argv
    if '-h' in args or '--help' in args:
        sys.stdout.write("%s\n" % __doc__)
        sys.exit()

    if os.path.exists(PIDFILE):
        sys.stderr.write("%s exists. an lcd deamon must be running.\n"
                         % PIDFILE)
        sys.exit(1)

    # Both spellings documented in the usage text keep the program in the
    # foreground (for debugging).
    if '-f' not in args and '--foreground' not in args:
        mkdaemon.mkDaemon(PIDFILE)

    linux = OSInfo()
    disk = SmartInfo()

    disk_update_thread = RunSmartUpdate(disk)
    disk_update_thread.setDaemon(True)
    disk_update_thread.start()

    load_avg = LoadAvg()
    load_avg.setDaemon(True)
    load_avg.start()

    heartbeat = Heartbeat()

    # Always create the display: without a host name fall back to the
    # default boot message instead of leaving ``lcd`` undefined.
    if linux.hostname != "":
        lcd = LCD("%16s%16s" % (DEFAULT_MSG, linux.hostname[:16].center(16)))
    else:
        lcd = LCD()
    lcd.writeln(1, linux.macaddr[-16:])

    # clean up pid file and stop running threads
    signal.signal(signal.SIGINT, cleanup)
    signal.signal(signal.SIGQUIT, cleanup)
    signal.signal(signal.SIGTERM, cleanup)

    lcd.writeln(1, linux.hostname[:16].rjust(16))
    try:
        while True:
            lcd.writeln(2, linux.macaddr[-16:])
            time.sleep(1)
            if heartbeat.is_alive():
                lcd.backlight_on()
            else:
                lcd.backlight_off()

            for k, v in sorted(disk.disks.items()):
                dname = k.split('/')[2]  # '/dev/sda' -> 'sda'
                lcd.writeln(2, "%s:%s%c%3s %5.1f" % (dname, v[1], lcd.DEG,
                                                    v[0], load_avg.get()))
                time.sleep(2)
    finally:
        # The loop only ends through an exception (including the
        # SystemExit raised by cleanup); close the port on the way out.
        lcd.close()


if __name__ == "__main__":
    main()