How it works:
# min hour day-of-month month day-of-week  command
# Runs at 01:00 every Sunday (day-of-week 0).
0 1 * * 0 /home/billw/bin/dailypope.sh
#!/usr/local/bin/bash
# Refresh the local mirror of the English-language pages, rebuild
# dailypope.html with Mathematica, and upload the result.

base="$HOME/software/books/benedict-xvi"

# Stop if the working directory is missing: otherwise wget would mirror the
# site into whatever directory the job happened to start in.
cd "$base" || exit 1

# -m mirror, -np never ascend to the parent directory, -w2 wait two seconds
# between requests, -A keep only files matching the English-page pattern.
# The exit status is not checked on purpose: the page is rebuilt from
# whatever is in the mirror even if some requests failed.
wget -m -np -w2 -A "*_en.htm*" http://www.vatican.va/holy_father/benedict_xvi/

# Build the page; skip the upload if the kernel could not be run at all.
/usr/local/bin/math < "$base/dailypope.m" || exit 1

scp "$base/dailypope.html" members.wolfram.com:/home/billw/public_html
(* dailypope.m: build dailypope.html, an index of the mirrored *_en.htm* files
   grouped by year, month and day, with each level collapsible in the browser. *)

(* Apply StringReplace with the given rules repeatedly until the string stops changing. *)
StringReplaceRepeated[s_String, rules___] := FixedPoint[StringReplace[#, rules]&, s];

(* Local directory prefix that is swapped for "http://" to turn a mirrored
   file path into a link. *)
mirrorRoot = "~/software/books/benedict-xvi/";

(* Icon placed next to every collapsible heading. The toggle script in the
   page swaps "closed" and "open" inside this URL. *)
closedGroupIcon = "http://members.wolfram.com/billw/images/closedGroup.gif";

(* Page head, toggle script and style sheet, up to and including the opening body tag.
   The pieces are joined with explicit newlines: the HTML comment opener inside
   the script element must sit on a line of its own, otherwise it comments out
   the toggle function that shares its line. The title lives in the head. *)
boilerplate = StringJoin[Riffle[{
  "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">",
  "<html>",
  "<head>",
  "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">",
  "<title>The Daily Pope</title>",
  "<script>",
  "<!--",
  "function toggle (id) {",
  "  el = document.getElementById (id);",
  "  cls = el.className;",
  "  action = 0;",
  "  if (cls.lastIndexOf ('_Co') == cls.length - 3) {",
  "    clsPre = cls.substring (0, cls.length - 3);",
  "    el.className = clsPre + '_Ex';",
  "    action = 1;",
  "  } else if (cls.lastIndexOf ('_Ex') == cls.length - 3) {",
  "    clsPre = cls.substring (0, cls.length - 3);",
  "    el.className = clsPre + '_Co';",
  "    action = 2;",
  "  }",
  "  if (action == 1) {",
  "    img = document.getElementById (id + '_');",
  "    img.src = img.src.replace ('closed', 'open');",
  "  } else if (action == 2) {",
  "    img = document.getElementById (id + '_');",
  "    img.src = img.src.replace ('open', 'closed');",
  "  }",
  "}",
  "-->",
  "</script>",
  "<style type='text/css'>",
  "<!--",
  ".Section {display:block; font-family:sans-serif; margin:6 0 12 0; font-size:large;}",
  "div.Section_Co { display:none; padding-top:5px; }",
  "div.Section_Ex { display:block; padding-top:5px; }",
  ".Subsection {display:block; font-family:sans-serif; margin:6 0 12 0; font-size:medium;}",
  "div.Subsection_Co { display:none; padding-top:5px; }",
  "div.Subsection_Ex { display:block; padding-top:5px; }",
  ".Subsubsection {display:block; font-family:monospace; margin:6 0 12 0; font-size:normal;}",
  "div.Subsubsection_Co { display:none; padding-top:5px; }",
  "div.Subsubsection_Ex { display:block; padding-top:5px; }",
  "a:link { text-decoration:none }",
  "a:visited { text-decoration:none }",
  "a:hover { text-decoration:none }",
  "a:active { text-decoration:none }",
  "-->",
  "</style>",
  "</head>",
  "<body>"
}, "\n"]];

(* StatCounter snippet and the public stats link, emitted just before the closing body tag. *)
tracker = StringJoin[Riffle[{
  "<!-- Start of StatCounter Code -->",
  "<script type=\"text/javascript\">",
  "var sc_project=4343649;",
  "var sc_invisible=0;",
  "var sc_partition=54;",
  "var sc_click_stat=1;",
  "var sc_security=\"52e617d6\";",
  "</script>",
  "<script type=\"text/javascript\" src=\"http://www.statcounter.com/counter/counter.js\"></script><noscript><div class=\"statcounter\"><a title=\"web stats\" href=\"http://www.statcounter.com/free_web_stats.html\" target=\"_blank\"><img class=\"statcounter\" src=\"http://c.statcounter.com/4343649/0/52e617d6/0/\" alt=\"web stats\" ></a></div></noscript>",
  "<!-- End of StatCounter Code --><br><a href=\"http://my.statcounter.com/project/standard/stats.php?project_id=4343649&guest=1\">View Page Stats</a>"
}, "\n"]];

(* Markup and entities stripped from titles and keywords before display. *)
markupRules = {"<b>" -> "", "</b>" -> "", "<i>" -> "", "</i>" -> "", "&quot;" -> "\""};

(* Group items under key[item]; the result is a sorted list of rules key -> {items...}. *)
groupByKey[items_List, key_] := Sort[Reap[Scan[Sow[#, key[#]]&, items], _, Rule][[2]]];

(* First yyyymmdd date found in a file path, or "nodate" when there is none. *)
fileDate[file_String] :=
  Quiet@Check[
    First[StringCases[file, DatePattern[{"Year", "Month", "Day"}, ""], Infinity]],
    "nodate"];

(* Content attribute of the first meta element whose name attribute equals
   name, or default when no such element is found. *)
metaContent[xml_, name_String, default_String] :=
  Replace[
    Cases[xml,
      XMLElement["meta", {"name" -> name, "content" -> content_String, ___}, ___] :> content,
      Infinity, 1],
    {{found_, ___} :> found, _ -> default}];

(* The two lines that open one collapsible group: the clickable heading, and
   the initially collapsed div that holds the group's contents. The caller
   emits the matching closing div. *)
groupHeader[class_String, n_Integer, label_String] :=
  With[{id = class <> "_" <> ToString[n]},
    {
      "<div class=\"" <> class <> "\"> <a href=\"javascript:toggle('" <> id <> "')\"> <img src=\"" <>
        closedGroupIcon <> "\" id=\"" <> id <> "_\" border=\"0\" width=\"8\" height=\"8\" /> " <>
        label <> "</a></div>",
      "<div class=\"" <> class <> "_Co\" id=\"" <> id <> "\">"
    }];

(* List items for one mirrored file: a link labelled with the title meta tag,
   followed by a nested list of the comma-separated subject keywords. *)
fileEntry[file_String] :=
  Module[{xml, title, keywords, url},
    (* the coolest part of this whole thing: Import[file, "XMLObject"] *)
    xml = Import[file, "XMLObject"];
    title = StringReplaceRepeated[metaContent[xml, "title", "Untitled"], markupRules];
    keywords = StringSplit[
      StringReplaceRepeated[
        metaContent[xml, "subject", ""],
        Append[markupRules, "\[CloseCurlyQuote]" -> "'"]],
      ", "];
    url = StringReplace[file, mirrorRoot -> "http://"];
    Join[
      {"<li><a href=\"" <> url <> "\">" <> title <> "</a></li>", "<ul>"},
      ("<li>" <> # <> "</li>")& /@ keywords,
      {"</ul>"}]
  ];

(* dailypope[dir] scans dir recursively for *_en.htm* files, groups them by the
   date found in each path, and writes dailypope.html into the current
   directory. Files with no date in their path are left out. *)
dailypope[dir_String] :=
  Module[
    {manifest, byDate, byMonth, byYear, dashed, labelled, out = {},
     (* each counter gets its own initializer so that all three are local *)
     sectionCount = 0, subsectionCount = 0, subsubsectionCount = 0},

    manifest = FileNames[{"*_en.htm*"}, dir, Infinity];

    (* sort manifest by date: date1 -> {file1, file2}, etc. *)
    byDate = groupByKey[manifest, fileDate];

    (* remove undated files - typically index files *)
    byDate = DeleteCases[byDate, Rule["nodate", _], Infinity];

    (* collect dates into months, keyed by yyyymm *)
    byMonth = groupByKey[byDate, StringTake[First[#], 6]&];

    (* collect months into years, keyed by yyyy *)
    byYear = groupByKey[byMonth, StringTake[First[#], 4]&];

    (* make dates easier for DateString to recognize:
       yyyymmdd becomes yyyy-mm-dd and yyyymm becomes yyyy-mm *)
    dashed = byYear /. {
      s_String /; StringMatchQ[s, RegularExpression["^[0-9]{8,8}$"]] :>
        StringTake[s, 4] <> "-" <> StringTake[s, {5, 6}] <> "-" <> StringTake[s, -2],
      s_String /; StringMatchQ[s, RegularExpression["^[0-9]{6,6}$"]] :>
        StringTake[s, 4] <> "-" <> StringTake[s, -2]};

    (* convert the dashed dates to English via DateString *)
    labelled = dashed /. {
      s_String /; StringMatchQ[s, RegularExpression["^[-0-9]{10,10}$"]] :>
        DateString[s, {"DayNameShort", " ", "Day", " ", "MonthName", " ", "Year"}],
      s_String /; StringMatchQ[s, RegularExpression["^[-0-9]{7,7}$"]] :>
        DateString[s, {"MonthName"}]};

    out = {
      boilerplate,
      "<h2>Pope Benedict XVI, Day by Day</h2>",
      "<p>All the works of Pope Benedict XVI from the Vatican website, sorted by date.</p>",
      "<p>The keywords listed for each work are taken from the file's \"subject\" metadata.</p>",
      "<p>The current plan is to update the page every Sunday morning starting at 1am. It involves a wget session that lasts a few hours, so if the Vatican webmasters object to the periodic and extensive traffic something else may need to be devised.</p>"};

    (* year -> {month -> {date -> {files}}}; every level is one collapsible group *)
    Scan[
      Function[yearEntry,
        sectionCount++;
        out = Join[out, groupHeader["Section", sectionCount, First[yearEntry]]];
        Scan[
          Function[monthEntry,
            subsectionCount++;
            out = Join[out, groupHeader["Subsection", subsectionCount, First[monthEntry]]];
            Scan[
              Function[dayEntry,
                subsubsectionCount++;
                out = Join[out, groupHeader["Subsubsection", subsubsectionCount, First[dayEntry]]];
                AppendTo[out, "<ul>"];
                Scan[(out = Join[out, fileEntry[#]])&, Last[dayEntry]];
                AppendTo[out, "</ul>"];
                AppendTo[out, "</div>"]
              ],
              Last[monthEntry]];
            AppendTo[out, "</div>"]
          ],
          Last[yearEntry]];
        AppendTo[out, "</div>"]
      ],
      labelled];

    out = Join[out, {tracker, "</body>", "</html>"}];

    (* write to a scratch file first, then swap it into place *)
    Export["out.txt", out];
    (* on the very first run there is no previous page to delete *)
    If[FileType["dailypope.html"] === File, DeleteFile["dailypope.html"]];
    RenameFile["out.txt", "dailypope.html"];
  ]

dir = mirrorRoot <> "www.vatican.va/holy_father/benedict_xvi/";
dailypope[dir];
Quit[];

(* this builds .org output *)
(* NOTE: nothing below is evaluated when the script is run, because it follows
   Quit[]. It refers to out and res5, which are not defined at top level; res5
   stands for the year/month/day tree with English date labels that dailypope
   builds. To use it, the snippet has to be placed inside dailypope's body in
   place of the HTML-emitting loop. *)
Module[{year, data1},
  {year, data1} = List @@ #;
  AppendTo[out, "* " <> year];
  Module[{month, data2},
    {month, data2} = List @@ #;
    AppendTo[out, "** " <> month];
    Module[{date, files},
      {date, files} = List @@ #;
      AppendTo[out, "*** " <> date];
      (* title and keywords are declared here so they do not become globals *)
      Module[{file, a, title, keywords},
        file = #;
        a = Import[file, "XMLObject"];
        title = Quiet@Check[
          First[Cases[a, XMLElement["meta", {"name" -> "title", "content" -> t_String, ___}, ___] :> t, Infinity]],
          "Untitled"];
        keywords = StringSplit[
          Quiet@Check[
            First[Cases[a, XMLElement["meta", {"name" -> "subject", "content" -> s_String, ___}, ___] :> s, Infinity]],
            ""],
          ", "];
        file = StringReplace[file, "~/software/books/benedict-xvi/" -> "http://"];
        AppendTo[out, "**** " <> "[[" <> file <> "][" <> title <> "]]"];
        AppendTo[out, " - " <> #]& /@ keywords;
      ]& /@ files
    ]& /@ data2;
  ]& /@ data1;
]& /@ res5;