-- ArpaNameServerWatcher
-- Periodically queries a fixed list of ARPA domain name servers, checks each reply
-- against the expected address, keeps a response-time histogram per server, and mails
-- a notice to `recipient` when a watched server changes between passing and failing.

DIRECTORY
  Arpa USING [nullAddress, Address],
  ArpaName USING [AddressToName, NameToAddress, ReplyStatus],
  ArpaNameSupport USING [AddressInList],
  ArpaNameQuery,
  BasicTime USING [GMT, Now],
  ConvertExtras USING [RopeFromArpaAddress],
  DFUtilities USING [DateToRope],
  GVSend USING [AddRecipient, AddToItem, CheckValidity, Create, Handle, Send, StartSend, StartSendInfo, StartText],
  IO USING [Flush, PutF, PutFR, PutRope, STREAM, time, Value],
  Process USING [Detach, SecondsToTicks],
  Rope USING [Cat, Equal, ROPE],
  UserCredentials USING [Get];

ArpaNameServerWatcher: CEDAR MONITOR
  IMPORTS ArpaName, ArpaNameSupport, ArpaNameQuery, BasicTime, ConvertExtras, DFUtilities, GVSend, IO, Process, Rope, UserCredentials =
BEGIN

  ROPE: TYPE = Rope.ROPE;
  STREAM: TYPE = IO.STREAM;

  started: BasicTime.GMT _ BasicTime.Now[];  -- module start time, printed by PrintStats

  -- Credentials of the logged-in user, used to authenticate the mail sent by SendTheMessage.
  senderPwd: ROPE = UserCredentials.Get[].password;
  sender: ROPE = UserCredentials.Get[].name;
  recipient: ROPE _ "ArpaSupport^.pa";

  -- Optional log stream; every logging call in this module is a no-op while it is NIL.
  -- The two lines below are commands that can be used to attach a log / dump statistics:
  log: STREAM _ NIL;
  <<_ ArpaNameServerWatcher.log _ ViewerIO.CreateViewerStreams["ArpaNameServerWatcher"].out;>>
  <<_ ArpaNameServerWatcher.PrintServerStats[ViewerIO.CreateViewerStreams["Stats"].out, "*"]>>

  interval: CARDINAL _ 15*60;  -- seconds between query rounds (15 mins)
  abort: BOOLEAN _ FALSE;  -- set TRUE to make Background exit after its next wakeup
  debug: BOOLEAN _ FALSE;  -- TRUE => CheckDomainPacket logs the header fields it inspects
  stats: BOOLEAN _ FALSE;  -- TRUE => CheckNameServer prints the histogram after each query

  -- One watched name server and the single query used to probe it.
  TestRecord: TYPE = RECORD[
    server: ROPE,  -- name of the server to query
    queryRope: ROPE,  -- name to look up on that server
    expectAddrs: LIST OF Arpa.Address,  -- acceptable addresses in the answer
    failed: BOOLEAN,  -- result of the most recent Query
    sendMessage: BOOLEAN,  -- mail `recipient` when `failed` changes
    timeOut: INT,  -- in milliSecs
    histogram: Histogram];

  tests: LIST OF TestRecord _ LIST[
    [server: "arisia.Xerox.COM", queryRope: "Xerox.com", expectAddrs: LIST[[13,0,12,232]],
      failed: FALSE, sendMessage: TRUE, timeOut: 60000, histogram: NEW[HistogramRecord]],
    [server: "sonora.dec.com", queryRope: "Xerox.com", expectAddrs: LIST[[13,0,12,232]],
      failed: FALSE, sendMessage: FALSE, timeOut: 60000, histogram: NEW[HistogramRecord]],
    [server: "palain.parc.Xerox.COM", queryRope: "palain.parc.Xerox.COM", expectAddrs: LIST[[13,1,100,208]],
      failed: FALSE, sendMessage: TRUE, timeOut: 60000, histogram: NEW[HistogramRecord]],
    [server: "pooh.parc.Xerox.COM", queryRope: "pooh.parc.Xerox.COM", expectAddrs: LIST[[13,2,16,167]],
      failed: FALSE, sendMessage: TRUE, timeOut: 60000, histogram: NEW[HistogramRecord]]
    ];

  sequenceNumber: CARDINAL _ 0;  -- not referenced anywhere in this module

  histogramSlots: INT = 300;
  histogramSlotSize: INT = 100;  -- ms/slot

  Histogram: TYPE = REF HistogramRecord;
  HistogramRecord: TYPE = RECORD [
    probes: INT _ 0,  -- queries sent (each retry counts)
    lost: INT _ 0,  -- queries that got no reply
    counters: ARRAY [0..histogramSlots) OF INT _ ALL[0]  -- replies per response-time slot
    ];

  Query: PROC [server: ROPE, queryRope: ROPE, expectAddrs: LIST OF Arpa.Address, timeOut: INT, histogram: Histogram]
    RETURNS[failed: BOOLEAN_ FALSE, how: ROPE _ NIL] = {
    -- Sends `queryRope` to `server` (retrying once if there is no reply) and validates the
    -- answer with CheckDomainPacket.  Updates `histogram` for every probe sent.
    -- Returns failed = TRUE with `how` describing a bad reply, or failed = TRUE with
    -- how = NIL when neither attempt got a reply.
    -- NOTE(review): if the server's own name cannot be resolved this returns failed = FALSE,
    -- so CheckNameServer may mail "back up" for a server that was marked failed. Confirm intent.
    badReply: BOOLEAN _ FALSE;
    where: Arpa.Address;
    reply: ArpaNameQuery.Reply;
    status: ArpaName.ReplyStatus;

    Report[log, "\nQuerying ", server];
    [where, status,] _ ArpaName.NameToAddress[server];
    IF status = down THEN {Report[log, "Can't load name Cache."]; RETURN; };
    IF status # ok OR where = Arpa.nullAddress THEN {
      Report[log, "Can't load address for server."]; RETURN; };
    Report[log, " = ", ConvertExtras.RopeFromArpaAddress[where], ".\n"];

    reply _ ArpaNameQuery.Query[server: where, query: queryRope, timeout: timeOut];
    histogram.probes _ histogram.probes + 1;
    IF reply = NIL THEN {  -- retry once
      Report[log, "No response.", "\n"];
      histogram.lost _ histogram.lost + 1;
      reply _ ArpaNameQuery.Query[server: where, query: queryRope, timeout: timeOut];
      histogram.probes _ histogram.probes + 1;
      };

    IF reply # NIL THEN {
      IF reply.source # where THEN {
        -- A reply from an unexpected address is logged and described in `how`,
        -- but is returned as failed = FALSE.
        IF log ~= NIL THEN IO.PutF[log, "Strange source address: expected %G, found %G = %G: \n",
          [rope[ConvertExtras.RopeFromArpaAddress[where]]],
          [rope[ConvertExtras.RopeFromArpaAddress[reply.source]]],
          [rope[ArpaName.AddressToName[reply.source].name]]];
        RETURN[FALSE, Rope.Cat["Strange source address: ",
          ConvertExtras.RopeFromArpaAddress[reply.source], " = ",
          ArpaName.AddressToName[reply.source].name]]};
      [badReply, how] _ CheckDomainPacket[queryRope, expectAddrs, reply];
      IF badReply THEN {
        IF log ~= NIL THEN {
          Report[log, "Incorrect Reply: ", how, "\n"];
          IO.PutF[log, "Response time was %G ms.\n", [integer[reply.responseTime]]]};
        RETURN[TRUE, how]}
      ELSE Report[log, "Reply: ok, "];
      IF log ~= NIL THEN IO.PutF[log, "The response time was %G ms.\n", [integer[reply.responseTime]]];
      -- Only good replies are entered in the response-time histogram.
      AddToCounter[histogram, reply.responseTime];
      }
    ELSE {
      histogram.lost _ histogram.lost + 1;
      Report[log, "No response.", "\n"];
      RETURN[TRUE]};
    Report[log, "\n"];
    RETURN[FALSE]
    };

  AddToCounter: PROC [histogram: Histogram, ms: INT] = {
    -- Counts one reply in the slot for response time `ms`.  Slot i holds times in
    -- ((i-1)*histogramSlotSize .. i*histogramSlotSize]; slot 0 holds ms <= 0 and the
    -- last slot also holds everything slower than the table covers.
    -- The value is clamped before any arithmetic so that a negative or very large `ms`
    -- can neither index outside `counters` nor overflow the rounding addition.
    index: INT;
    IF ms <= 0 THEN index _ 0
    ELSE IF ms > (histogramSlots-1)*histogramSlotSize THEN index _ histogramSlots-1
    ELSE index _ (ms + histogramSlotSize - 1)/histogramSlotSize;  -- round up to the slot's upper bound
    histogram.counters[index] _ histogram.counters[index] + 1;
    };

  PrintStats: PROC [log: IO.STREAM, histogram: Histogram] = {
    -- Writes the probe/loss totals and the non-empty histogram slots to `log`.
    -- Each histogram row is: count, percent of probes, cumulative percent, slot upper bound in ms.
    running: INT _ 0;
    now: BasicTime.GMT _ BasicTime.Now[];
    nowRope: ROPE _ DFUtilities.DateToRope[[format: explicit, gmt: now]];
    startedRope: ROPE _ DFUtilities.DateToRope[[format: explicit, gmt: started]];
    IF log = NIL THEN RETURN;  -- same convention as Report: no stream, no output
    Report[log, "Started: ", startedRope, "\n"];
    Report[log, "Ended: ", nowRope, "\n"];
    IO.PutF[log, " Queries: %G.\n", [integer[histogram.probes]] ];
    IF histogram.probes = 0 THEN RETURN;  -- nothing further to print; also avoids dividing by zero
    IO.PutF[log, " No reply: %G, %1.2F%%.\n",
      [integer[histogram.lost]], [real[100.0*histogram.lost/histogram.probes]] ];
    IO.PutRope[log, " Response time histogram:\n"];
    FOR i: INT IN [0..histogramSlots) DO
      counter: INT _ histogram.counters[i];
      milliseconds: INT _ i*histogramSlotSize;
      counterPerCent, runningPerCent: REAL;
      IF counter = 0 THEN LOOP;
      running _ running + counter;
      counterPerCent _ 100.0*counter/histogram.probes;
      runningPerCent _ 100.0*running/histogram.probes;
      IO.PutF[log, "%7G %7.2F %7.2F %7G\n",
        [integer[counter]], [real[counterPerCent]], [real[runningPerCent]], [integer[milliseconds]] ];
      ENDLOOP;
    IO.PutRope[log, "\n"];
    IO.Flush[log];
    };

  PrintServerStats: PROC [log: IO.STREAM, server: Rope.ROPE] = {
    -- Prints the statistics of every entry in `tests` whose server name matches `server`
    -- (case is ignored); "*" selects all servers.
    FOR list: LIST OF TestRecord _ tests, list.rest UNTIL list = NIL DO
      this: ROPE _ list.first.server;
      IF Rope.Equal[this, server, FALSE] OR Rope.Equal[server, "*", FALSE] THEN {
        Report[log, "Server: ", this, "\n"];
        PrintStats[log, list.first.histogram];
        Report[log, "\n"];
        IO.Flush[log]};
      ENDLOOP;
    };

  CheckDomainPacket: PROC [queryRope: ROPE, expectAddrs: LIST OF Arpa.Address, reply: ArpaNameQuery.Reply]
    RETURNS[failed: BOOLEAN _ FALSE, how: ROPE _ NIL] = TRUSTED {
    -- Validates `reply` as the answer to an address query for `queryRope`.
    -- Returns failed = TRUE and a short reason in `how` at the first check that fails;
    -- falls off the end with failed = FALSE when every check passes.
    -- When `debug` is set the inspected header fields are also written to the module log.
    SELECT reply.hdr.qr FROM
      response => IF debug THEN Report[log, "qr: response, "];
      ENDCASE => RETURN[TRUE, "qr # response."];
    SELECT reply.hdr.opcode FROM
      query => IF debug THEN Report[log, "op: query"];
      ENDCASE => RETURN[TRUE, "op # query."];
    IF debug AND log ~= NIL THEN
      IO.PutF[log, ", length: %G bytes.\n", [integer[reply.domainPacketLength]]];
    IF debug AND log ~= NIL THEN IO.PutF[log, "aa: %G, tc: %G, rd: %G, ra: %G\n",
      [boolean[reply.hdr.authoritative]],
      [boolean[reply.hdr.truncated]],
      [boolean[reply.hdr.recurDesired]],
      [boolean[reply.hdr.recurAvail]]];
    SELECT reply.hdr.rcode FROM
      ok => IF debug THEN Report[log, "rcode: ok"];  -- label fixed: this is the rcode field, not the opcode
      format => RETURN[TRUE, "Format error."];
      serverFailed => RETURN[TRUE, "serverFailed."];
      nameError => RETURN[TRUE, "nameError."];
      notImplemented => RETURN[TRUE, "notImplemented."];
      refused => RETURN[TRUE, "refused."];
      ENDCASE => RETURN[TRUE, "unknown."];
    IF reply.hdr.truncated THEN {Report[log, " ** TRUNCATED **"]; RETURN[TRUE, "Truncated."];};
    IF debug THEN Report[log, "\n"];
    IF debug AND log ~= NIL THEN IO.PutF[log, "qdCount: %G, anCount: %G, nsCount: %G, arCount: %G\n",
      [integer[reply.hdr.qdCount]],
      [integer[reply.hdr.anCount]],
      [integer[reply.hdr.nsCount]],
      [integer[reply.hdr.arCount]]];

    -- Exactly one question, of type a and class in, must be echoed back.
    IF reply.hdr.qdCount # 1 THEN RETURN[TRUE, "qdCount # 1"];
    IF debug AND log ~= NIL THEN
      IO.PutF[log, "Query Name: \"%G\", ", [rope[reply.questions[0].name]]];
    IF reply.questions[0].type # a THEN RETURN[TRUE, "Incorrect query type."];
    IF reply.questions[0].class # in THEN RETURN[TRUE, "Incorrect query class."];

    -- Exactly one answer, an address record for queryRope with an expected address.
    -- NOTE(review): this tests reply.anCount while the other counts are read from reply.hdr;
    -- presumably both hold the answer count. Verify against ArpaNameQuery.Reply.
    IF reply.anCount # 1 THEN RETURN[TRUE, "anCount # 1"];
    WITH reply.answers[0] SELECT FROM
      rr: ArpaNameQuery.ARR => {
        IF ~Rope.Equal[rr.name, queryRope, FALSE] THEN RETURN[TRUE, "Incorrect response name."];
        IF rr.type # a THEN RETURN[TRUE, "Incorrect response type."];
        IF rr.class # in THEN RETURN[TRUE, "Incorrect response class."];
        IF rr.dataLength # 4 THEN RETURN[TRUE, "Funny Length.\n"];
        IF ~ArpaNameSupport.AddressInList[expectAddrs, rr.address] THEN
          RETURN[TRUE, "Incorrect address."];
        };
      ENDCASE => RETURN[TRUE, ", Bogus RR.\n"];

    -- No authority or additional records are accepted.
    IF reply.hdr.nsCount # 0 THEN RETURN[TRUE, "nsCount # 0"];
    IF reply.hdr.arCount # 0 THEN RETURN[TRUE, "arCount # 0"];
    };

  Report: PROC [log: IO.STREAM, r1, r2, r3, r4: ROPE _ NIL] = TRUSTED {
    -- Writes the non-NIL ropes to `log` in order; does nothing when `log` is NIL.
    IF log = NIL THEN RETURN;
    IF r1 # NIL THEN {IO.PutRope[log, r1]};
    IF r2 # NIL THEN {IO.PutRope[log, r2]};
    IF r3 # NIL THEN {IO.PutRope[log, r3]};
    IF r4 # NIL THEN {IO.PutRope[log, r4]};
    };

  SendTheMessage: PROC [server: ROPE, justFailed: BOOLEAN, how: ROPE_ NIL] = {
    -- Mails a status notice about `server` to `recipient`.
    --   justFailed = FALSE: the server is back up.
    --   justFailed = TRUE, how # NIL: the server gave a bad reply described by `how`.
    --   justFailed = TRUE, how = NIL: the server did not respond.
    msg: ROPE;
    handle: GVSend.Handle;
    ssi: GVSend.StartSendInfo;

    -- Message header.
    msg _ Rope.Cat[msg, "Date: ", IO.PutFR["%G", IO.time[]], "\n"];
    msg _ Rope.Cat[msg, "From: Mailer.pa (ArpaGateway NameWatcher)\n"];
    msg _ Rope.Cat[msg, "Subject: ", server];
    msg _ Rope.Cat[msg, SELECT TRUE FROM
      ~justFailed => " name server back up.\n",
      how # NIL => " name server problem.\n",
      ENDCASE => " name server not responding.\n" ];
    msg _ Rope.Cat[msg, "To: ", recipient, "\n"];
    msg _ Rope.Cat[msg, "\n"];

    -- Message body (empty for a "back up" notice with no `how`).
    IF how ~= NIL THEN msg _ Rope.Cat[msg, "Problem: ", how, "\n"]
    ELSE {IF justFailed THEN msg _ Rope.Cat[msg,
      "Problem could be with the ArpaGateway, a cisco router, or the name server on ",
      server,
      ".\n\nTo restart the name server; as superuser kill off the old one if it is still running and type \"/usr/etc/in.named\".\n"]};

    handle _ GVSend.Create[];
    ssi _ GVSend.StartSend[
      handle: handle,
      senderPwd: senderPwd,
      sender: sender,
      returnTo: NIL,
      validate: TRUE ];
    IF ssi ~= ok THEN {
      -- Previously dropped silently; leave a trace in the log (no-op when log is NIL).
      Report[log, "Can't start mail send; message not sent.", "\n"];
      RETURN};
    GVSend.AddRecipient[handle, recipient];
    IF GVSend.CheckValidity[handle, NIL] # 1 THEN ERROR;  -- the single recipient must be valid
    GVSend.StartText[handle];
    GVSend.AddToItem[handle, msg];
    GVSend.Send[handle];
    };

  CheckNameServer: PROC = {
    -- Runs one round: queries every entry of `tests`, records the new failed state, and for
    -- entries with sendMessage set mails a notice when that state changed since the last round.
    how: ROPE _ NIL;
    FOR list: LIST OF TestRecord _ tests, list.rest UNTIL list = NIL DO
      server: ROPE _ list.first.server;
      wasFailed: BOOLEAN _ list.first.failed;
      IF server = NIL THEN EXIT;  -- an entry without a server name ends the round
      [list.first.failed, how] _ Query[server, list.first.queryRope, list.first.expectAddrs,
        list.first.timeOut, list.first.histogram];
      IF list.first.sendMessage THEN {
        IF list.first.failed AND ~wasFailed THEN {
          Report[log, "Sending failure message to: ", recipient, "\n"];
          SendTheMessage[server, TRUE, how]}
        ELSE IF wasFailed AND ~list.first.failed THEN {
          Report[log, "Sending up message to: ", recipient, "\n"];
          SendTheMessage[server, FALSE]}};
      IF log ~= NIL AND stats THEN {PrintStats[log, list.first.histogram]; IO.Flush[log]};
      ENDLOOP;
    };

  Background: ENTRY PROC = {
    -- Body of the watcher process: sleeps `interval` seconds, then runs CheckNameServer,
    -- until `abort` is found set after a wakeup.
    ENABLE UNWIND => NULL;
    DO
      -- Built afresh on every pass, so the current value of `interval` is used each time.
      snooz: CONDITION _ [timeout: Process.SecondsToTicks[interval]];
      WAIT snooz;
      IF abort THEN {Report[log, "\n", "Aborted.", "\n"]; EXIT};
      CheckNameServer[];
      ENDLOOP;
    };

  -- Module start code: run the watcher as a detached process.
  TRUSTED {Process.Detach[FORK Background[]]; };

END.