#!/usr/bin/env python
# Copyright (c) 2009 Dan Carley <[email protected]>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
"""
Report failures from hardware RAID controllers.
Requires the supporting utilities:
mpt-status(8) for MPT controllers.
tw_cli(8) for 3ware controllers.
Intended to be scheduled from crontab as follows:
MAILTO="[email protected]"
0 */3 * * * /usr/local/sbin/hwraid_monitor.py options
"""
from re import search
from sys import exit
from os.path import isfile
from optparse import OptionParser
from subprocess import Popen, PIPE
# Per-controller settings.
#   cmd    -- status utility and its fixed arguments.
#   contr  -- regex identifying controller lines in the utility's listing, or
#             None when the utility is queried once with no controller name.
#   array / drive -- 'regex' is matched against the first whitespace-separated
#             field of an output line, 'pos' is the index of the field holding
#             the status and 'string' is the only status treated as healthy.
_CONTROLLER_SPECS = {
    'mpt': {
        'cmd': ['/usr/sbin/mpt-status', '-s'],
        'contr': None,
        'array': {'regex': r'^log_id$', 'pos': 2, 'string': 'OPTIMAL'},
        'drive': {'regex': r'^phys_id$', 'pos': 2, 'string': 'ONLINE'},
    },
    'tw': {
        'cmd': ['/sbin/tw_cli', 'info'],
        'contr': {'regex': r'^c\d+$'},
        'array': {'regex': r'^u\d+$', 'pos': 2, 'string': 'OK'},
        'drive': {'regex': r'^p\d+$', 'pos': 1, 'string': 'OK'},
    },
}


def _run_utility(cmd):
    """Run *cmd* and return its standard output as text.

    Standard error is captured as well and, when not empty, printed so that
    it becomes part of the report instead of being silently dropped.
    """
    # universal_newlines=True yields text (not bytes) on Python 3 and
    # normalises line endings, so the caller can split on '\n' everywhere.
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    out, err = proc.communicate()
    if err:
        print(err.rstrip('\n'))
    return out


def check_controller(type):
    """Check the arrays and drives behind one kind of RAID controller.

    Runs the controller's status utility, prints a line for every array or
    drive whose status differs from the expected healthy string, and reports
    the overall result.

    Args:
        type: Controller family, either 'mpt' or 'tw'.

    Returns:
        True when no array or drive failure was printed; False when the
        status utility is not installed or at least one failure was found.

    Raises:
        ValueError: If *type* is not a supported controller family.
    """
    # print(...) with a single argument behaves identically as a Python 2
    # print statement and as a Python 3 function call.
    if type not in _CONTROLLER_SPECS:
        raise ValueError("Unknown controller type: %r" % (type,))
    spec = _CONTROLLER_SPECS[type]

    cmd = spec['cmd']
    if not isfile(cmd[0]):
        print("%s: Utility not found" % cmd[0])
        return False

    if spec['contr'] is None:
        # Single query; no controller name is appended to the command line.
        commands = [cmd]
    else:
        # List the controllers first, then query each one by name.
        commands = []
        for line in _run_utility(cmd).split('\n'):
            fields = line.split()
            if len(fields) > 2 and search(spec['contr']['regex'], fields[0]):
                commands.append(cmd + [fields[0]])

    ret = True
    checks = (("Array", spec['array']), ("Drive", spec['drive']))
    # NOTE: when no controller is listed, nothing is queried and the result
    # stays True.
    for command in commands:
        for line in _run_utility(command).split('\n'):
            fields = line.split()
            # Status lines have at least three fields; this also guarantees
            # that every configured 'pos' index is in range.
            if len(fields) <= 2:
                continue
            for label, check in checks:
                if (search(check['regex'], fields[0])
                        and fields[check['pos']] != check['string']):
                    print("%s failure: \n\t%s" % (label, '\t'.join(fields)))
                    ret = False
    return ret
def main():
    """Parse the command line and run the requested controller checks.

    Prints the usage text and exits with status 2 when neither --mpt nor
    --tw is given; exits with status 1 when any requested check fails.
    """
    parser = OptionParser(usage="usage: %prog options")
    parser.add_option("--mpt", dest="mpt", action="store_true",
                      default=False, help="MPT controller support.")
    parser.add_option("--tw", dest="tw", action="store_true",
                      default=False, help="3ware controller support.")
    opts, _unused_args = parser.parse_args()

    if not (opts.mpt or opts.tw):
        parser.print_help()
        exit(2)

    # Collect every result before deciding, so that each requested controller
    # is checked and reported on even when an earlier one has already failed.
    requested = [name for name, wanted in (('mpt', opts.mpt), ('tw', opts.tw))
                 if wanted]
    outcomes = [check_controller(name) for name in requested]
    if not all(outcomes):
        exit(1)
# Run the checks only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
如果您将软件 RAID 与常规磁盘控制器一起使用,请使用:
其中 /dev/md0 仅为示例设备名。这将向您显示当前状态。如果驱动器出现故障,您还会在 /var/log/messages 中看到大量相关的错误信息。
这取决于 RAID 的类型。对于 LSI 控制器(许多戴尔和惠普服务器都使用它),可以使用名为 MegaCLI 的工具。
用于 3ware 卡 - tw_cli
它通常带有硬件的“驱动程序”或文档。
如果这是软件 RAID(mdadm),并且您想查看当前状态,只需执行 cat /proc/mdstat。如果您希望每 10 秒刷新一次屏幕,可以执行 watch -n 10 cat /proc/mdstat。
戴尔可能会提供一个工具来监控它,但我可以猜测它会像大多数乏善可陈的 OEM 实用程序一样使用 Java 实现并膨胀。
幸运的是,出色的mpt-status实用程序似乎支持 SC1435。只需确保您在内核中启用了以下选项:
然后,您可以使用 CLI 中的 mpt-status 查看 RAID 阵列的运行状况。
此外,我个人使用一个由 cron 调用的简单 Python 脚本,它定期检查状态并通过电子邮件向我们发出警报,行为方式与 mdadm 类似。您当然可以指定希望检查的频率。欢迎自行取用:
mdadm 提供有关 Linux 软件 RAID 各个方面的详细信息。
watch -n 10 cat /proc/mdstat 这将在你的系统上设置