How it works:


~/crontab
# Fields: minute hour day-of-month month day-of-week command.
# Runs dailypope.sh at 01:00 every Sunday.
0 1 * * Sun /home/billw/bin/dailypope.sh

dailypope.sh
#!/usr/local/bin/bash
#
# Refreshes the local mirror of the English-language Benedict XVI pages,
# regenerates dailypope.html with Mathematica, and uploads the result.

# Stop if the working directory is unavailable: wget -m writes its mirror tree
# into the current directory, and dailypope.m writes out.txt / dailypope.html
# with relative paths, so continuing from the wrong directory would scatter
# files and upload a stale page.
cd ~/software/books/benedict-xvi/ || exit 1

# -m  mirror recursively, -np never ascend to the parent directory,
# -w2 wait 2 seconds between requests, -A keep only files matching *_en.htm*.
wget -m -np -w2 -A "*_en.htm*" http://www.vatican.va/holy_father/benedict_xvi/

# Build dailypope.html from the mirrored files.
/usr/local/bin/math < ~/software/books/benedict-xvi/dailypope.m

# Publish the generated page.
scp ~/software/books/benedict-xvi/dailypope.html members.wolfram.com:/home/billw/public_html

dailypope.m (a Mathematica program)
(* StringReplaceRepeated[s, rules]
   Applies StringReplace with the given rules to s again and again until the
   result no longer changes, and returns that final string. *)
StringReplaceRepeated[s_String, rules___] :=
  FixedPoint[Function[current, StringReplace[current, rules]], s];

(* boilerplate: the fixed opening of the generated page - doctype, <head> with
   the toggle() script and the stylesheet, and the opening <body> tag.
   toggle(id) flips an element's class between the "..._Co" (collapsed,
   display:none) and "..._Ex" (expanded, display:block) variants and swaps
   "closed"/"open" in the src of the image whose id is id followed by "_".
   The content-type meta tag uses the MIME type text/html; the previous
   "text-html" is not a valid type, so the charset declaration was not usable. *)
boilerplate =
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"
                    \"http://www.w3.org/TR/html4/loose.dtd\">
<html>
<head>
<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">
<script>
<!--
function toggle (id) {
el = document.getElementById (id);
cls = el.className;
action = 0;
if (cls.lastIndexOf ('_Co') == cls.length - 3) {
clsPre = cls.substring (0, cls.length - 3);
el.className = clsPre + '_Ex';
action = 1;
}
else if (cls.lastIndexOf ('_Ex') == cls.length - 3) {
clsPre = cls.substring (0, cls.length - 3);
el.className = clsPre + '_Co';
action = 2;
}
if (action == 1) {
img = document.getElementById (id + '_');
img.src = img.src.replace ('closed', 'open');
}
else if (action == 2) {
img = document.getElementById (id + '_');
img.src = img.src.replace ('open', 'closed');
}
}
-->
</script>
<style type='text/css'>
<!--
.Section {display:block; font-family:sans-serif; margin:6 0 12 0; font-size:large;}
div.Section_Co { display:none; padding-top:5px; }
div.Section_Ex { display:block; padding-top:5px; }
.Subsection {display:block; font-family:sans-serif; margin:6 0 12 0; font-size:medium;}
div.Subsection_Co { display:none; padding-top:5px; }
div.Subsection_Ex { display:block; padding-top:5px; }
.Subsubsection {display:block; font-family:monospace; margin:6 0 12 0; font-size:normal;}
div.Subsubsection_Co { display:none; padding-top:5px; }
div.Subsubsection_Ex { display:block; padding-top:5px; }
a:link { text-decoration:none }
a:visited { text-decoration:none }
a:hover { text-decoration:none }
a:active { text-decoration:none }
-->
</style>
</head>
<body>
";

(* tracker: the StatCounter snippet (script, <noscript> fallback image, and a
   "View Page Stats" link) as one HTML string. dailypope appends it to the
   output just before the closing </body> tag. *)
tracker =
"<!-- Start of StatCounter Code -->
<script type=\"text/javascript\">
var sc_project=4343649;
var sc_invisible=0;
var sc_partition=54;
var sc_click_stat=1;
var sc_security=\"52e617d6\";
</script>

<script type=\"text/javascript\" src=\"http://www.statcounter.com/counter/counter.js\"></script><noscript><div class=\"statcounter\"><a title=\"web stats\" href=\"http://www.statcounter.com/free_web_stats.html\" target=\"_blank\"><img class=\"statcounter\" src=\"http://c.statcounter.com/4343649/0/52e617d6/0/\" alt=\"web stats\" ></a></div></noscript>
<!-- End of StatCounter Code --><br><a href=\"http://my.statcounter.com/project/standard/stats.php?project_id=4343649&guest=1\">View Page Stats</a>
";

(* dailypope[dir]
   Builds dailypope.html, a collapsible year / month / day index of every
   "*_en.htm*" file found anywhere under dir.

   dir - directory that is searched recursively for the mirrored pages.

   Each file is filed under the first yyyymmdd date found in its path; files
   without such a date are left out. For every file the "title" and "subject"
   meta tags are read; the title becomes the link text and the comma-separated
   subject entries are listed as keywords under it.

   Side effects: writes out.txt and then replaces dailypope.html with it, both
   relative to the current working directory. Returns Null.

   Relies on the globals boilerplate, tracker and StringReplaceRepeated. *)
dailypope[dir_String] :=
  Module[{manifest, res, res2, res3, res4, res5, out = {},
      sectionCount = 0, subsectionCount = 0, subsubsectionCount = 0,
      closedIcon = "http://members.wolfram.com/billw/images/closedGroup.gif"},
    (* Every counter has its own initializer so that all three are local; a
       chained "a = b = c = 0" initializer would localize only a and assign b
       and c as global symbols. closedIcon is shared by all three heading
       levels so they cannot drift apart. *)

    manifest = FileNames[{"*_en.htm*"}, dir, Infinity];

    (* group files by date: date1 -> {file1, file2}, etc.; files whose path
       holds no yyyymmdd date are collected under "nodate" *)
    res = Sort@Reap[Sow[#, Quiet@Check[First[StringCases[#, DatePattern[{"Year", "Month", "Day"}, ""], Infinity]], "nodate"]]& /@ manifest, _, Rule][[2]];

    (* remove undated files - typically index files *)
    res = DeleteCases[res, Rule["nodate", _], Infinity];

    (* collect dates into months: yyyymm -> {date1 -> files1, ...} *)
    res2 = Sort@Reap[Sow[#, StringTake[First[#], 6]]& /@ res, _, Rule][[2]];

    (* collect months into years: yyyy -> {month1 -> dates1, ...} *)
    res3 = Sort@Reap[Sow[#, StringTake[First[#], 4]]& /@ res2, _, Rule][[2]];

    (* make dates easier for DateString to recognize: convert yyyymmdd to yyyy-mm-dd
       and yyyymm to yyyy-mm *)
    res4 = res3 /. {
      s_String /; StringMatchQ[s, RegularExpression["^[0-9]{8,8}$"]] :> StringTake[s, 4]<>"-"<>StringTake[s, {5,6}]<>"-"<>StringTake[s, -2],
      s_String /; StringMatchQ[s, RegularExpression["^[0-9]{6,6}$"]] :> StringTake[s, 4]<>"-"<>StringTake[s, -2]};

    (* convert yyyy-mm-dd dates and yyyy-mm months to English via DateString *)
    res5 = res4 /. {
      s_String /; StringMatchQ[s, RegularExpression["^[-0-9]{10,10}$"]] :> DateString[s, {"DayNameShort", " ", "Day", " ", "MonthName", " ", "Year"}],
      s_String /; StringMatchQ[s, RegularExpression["^[-0-9]{7,7}$"]] :> DateString[s, {"MonthName"}]};

    (* page header and introduction *)
    AppendTo[out, boilerplate];
    AppendTo[out, "<title>The Daily Pope</title>"];
    AppendTo[out, "<h2>Pope Benedict XVI, Day by Day</h2>"];
    AppendTo[out, "<p>All the works of Pope Benedict XVI from the Vatican website, sorted by date.</p>"];
    AppendTo[out, "<p>The keywords listed for each work are taken from the file's \"subject\" metadata.</p>"];
    AppendTo[out, "<p>The current plan is to update the page every Sunday morning starting at 1am.  It involves a wget session that lasts a few hours, so if the Vatican webmasters object to the periodic and extensive traffic something else may need to be devised.</p>"];

    (* one collapsible block per year, containing one per month, containing one per day *)
    Scan[
      Function[yearEntry,
        Module[{year, months, sectionId},
          {year, months} = List @@ yearEntry;
          sectionCount++;
          sectionId = "Section_" <> ToString[sectionCount];
          AppendTo[out, "<div class=\"Section\"> <a href=\"javascript:toggle('" <> sectionId <> "')\"> <img src=\"" <> closedIcon <> "\" id=\"" <> sectionId <> "_\" border=\"0\" width=\"8\" height=\"8\" />&nbsp;" <> year <> "</a></div>"];
          AppendTo[out, "<div class=\"Section_Co\" id=\"" <> sectionId <> "\">"];
          Scan[
            Function[monthEntry,
              Module[{month, days, subsectionId},
                {month, days} = List @@ monthEntry;
                subsectionCount++;
                subsectionId = "Subsection_" <> ToString[subsectionCount];
                AppendTo[out, "<div class=\"Subsection\"> <a href=\"javascript:toggle('" <> subsectionId <> "')\">&nbsp;&nbsp;&nbsp;&nbsp;<img src=\"" <> closedIcon <> "\" id=\"" <> subsectionId <> "_\" border=\"0\" width=\"8\" height=\"8\" />&nbsp;" <> month <> "</a></div>"];
                AppendTo[out, "<div class=\"Subsection_Co\" id=\"" <> subsectionId <> "\">"];
                Scan[
                  Function[dayEntry,
                    Module[{date, files, subsubsectionId},
                      {date, files} = List @@ dayEntry;
                      subsubsectionCount++;
                      subsubsectionId = "Subsubsection_" <> ToString[subsubsectionCount];
                      (* uses the same public icon URL as the two outer levels
                         (this level used to point at http://localhost/) *)
                      AppendTo[out, "<div class=\"Subsubsection\"> <a href=\"javascript:toggle('" <> subsubsectionId <> "')\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<img src=\"" <> closedIcon <> "\" id=\"" <> subsubsectionId <> "_\" border=\"0\" width=\"8\" height=\"8\" />&nbsp;" <> date <> "</a></div>"];
                      AppendTo[out, "<div class=\"Subsubsection_Co\" id=\"" <> subsubsectionId <> "\">"];
                      AppendTo[out, "<ul>"];
                      Scan[
                        Function[path,
                          Module[{file = path, a, title, keywords},
                            (* the coolest part of this whole thing: Import[file, "XMLObject"] *)
                            a = Import[file, "XMLObject"];
                            (* first <meta name="title">, or "Untitled" when there is none *)
                            title = Quiet@Check[First[Cases[a, XMLElement["meta", {"name" -> "title", "content"-> t_String, ___}, ___] :> t, Infinity]], "Untitled"];
                            title = StringReplaceRepeated[title, {"<b>" -> "", "</b>" -> "", "<i>" -> "", "</i>" -> "", "&quot" -> "\""}];
                            (* first <meta name="subject">, or "" when there is none *)
                            keywords = Quiet@Check[First[Cases[a, XMLElement["meta", {"name" -> "subject", "content"-> s_String, ___}, ___] :> s, Infinity]], ""];
                            keywords = StringReplaceRepeated[keywords,
                              {"<b>" -> "", "</b>" -> "", "<i>" -> "", "</i>" -> "", "&quot" -> "\"", "\[CloseCurlyQuote]" -> "'"}
                            ];
                            keywords = StringSplit[keywords, ", "];
                            (* turn the local mirror path back into the original URL *)
                            file = StringReplace[file, "~/software/books/benedict-xvi/" -> "http://"];
                            AppendTo[out, "<li><a href=\"" <> file <> "\">" <> title <> "</a></li>"];
                            AppendTo[out, "<ul>"];
                            Scan[AppendTo[out, "<li>" <> # <> "</li>"]&, keywords];
                            AppendTo[out, "</ul>"];
                          ]],
                        files];
                      AppendTo[out, "</ul>"];
                      AppendTo[out, "</div>"];
                    ]],
                  days];
                AppendTo[out, "</div>"];
              ]],
            months];
          AppendTo[out, "</div>"];
        ]],
      res5];

    AppendTo[out, tracker];
    AppendTo[out, "</body>"];

    (* Export picks its format from the extension, so the lines are written as
       plain text to out.txt first and the file is renamed afterwards. *)
    Export["out.txt", out];
    (* only delete a previous page if there is one, so a first run does not fail *)
    If[FileType["dailypope.html"] === File, DeleteFile["dailypope.html"]];
    RenameFile["out.txt", "dailypope.html"];
  ]

(* Script entry point: run dailypope on the mirrored benedict_xvi directory. *)
dir = "~/software/books/benedict-xvi/www.vatican.va/holy_father/benedict_xvi/";
dailypope[dir];

(* End the kernel session once the page has been generated. *)
Quit[];

(* This builds .org output: an alternative to the HTML-emitting loop in
   dailypope. Years, months and dates become "*", "**" and "***" headings,
   each file becomes a "****" heading holding a [[link][title]], and every
   keyword becomes a " - " list item.
   It is a fragment, not a standalone program: it appends to out and iterates
   over res5, both of which exist only inside dailypope, and it is placed
   after Quit[].
   title and keywords are declared in the innermost Module so that they stay
   local instead of being assigned as global symbols. *)

    Module[{year, data1},
      {year, data1} = List @@ #;
      AppendTo[out, "* "<>year];
      Module[{month, data2},
        {month, data2} = List @@ #;
        AppendTo[out, "** "<>month];
        Module[{date, files},
          {date, files} = List @@ #;
          AppendTo[out, "*** "<>date];
          Module[{file, a, title, keywords},
            file = #;
            a = Import[file, "XMLObject"];
            (* first <meta name="title">, or "Untitled" when there is none *)
            title = Quiet@Check[First[Cases[a, XMLElement["meta", {"name" -> "title", "content"-> t_String, ___}, ___] :> t, Infinity]], "Untitled"];
            (* comma-separated entries of the first <meta name="subject">, or none *)
            keywords = StringSplit[
              Quiet@Check[First[Cases[a, XMLElement["meta", {"name" -> "subject", "content"-> s_String, ___}, ___] :> s, Infinity]], ""],
              ", "];
            (* turn the local mirror path back into the original URL *)
            file = StringReplace[file, "~/software/books/benedict-xvi/" -> "http://"];
            AppendTo[out, "**** "<>"[["<>file<>"]["<>title<>"]]"];
            AppendTo[out, " - "<>#]& /@ keywords;
            ]& /@ files
        ]& /@ data2;
      ]& /@ data1;
    ]& /@ res5;