Collaborama About Help Contact Anonymous [login] Source: site.view [edit] Function name: buildSearchEngine Arguments: Description: Compile the exported dataset into a search index. Page type: webl Render function: Module: perfectCartoon Page source:

// buildSearchEngine
//
// Downloads the tab-separated cartoon export (settings.dbUrl) and a CSV of tag
// synonyms, builds one Lucene document per usable row in the STAGING index
// (settings.staging), then stores per-tag / per-author usage counts back into
// the "cartoonDBSettings" function info and reports to the admin console.
//
// Takes no arguments. Side effects: writes the staging index, rewrites
// cartoonDBSettings (tagSet, tagSetFr, authorSet) and calls adminConsole.

var fi = Wub_GetFunctionInfo("cartoonDBSettings");
var settings = WubEval(fi.exec);

// Download the export first, so that a failed download aborts the script
// before an index writer has been opened. Up to three attempts are chained
// with the "?" service combinator; the last attempt disables auto-redirect.
var P = GetURL(settings.dbUrl, nil, nil, [. mimetype="text/plain", autoredirect=true .])
      ? GetURL(settings.dbUrl, nil, nil, [. mimetype="text/plain", autoredirect=true .])
      ? GetURL(settings.dbUrl, nil, nil, [. mimetype="text/plain", autoredirect=false .]);

// ---- Synonym table -------------------------------------------------------
// Each CSV row is "tag, synonym, synonym, ...". The table maps the lower-cased
// tag to its synonyms joined by single spaces. A failed download simply
// yields an empty table.
var synonyms = [. .];
var SynP = Markup(GetURL("https://docs.google.com/spreadsheets/d/e/2PACX-1vSJQvhH7JF3V6CgIopHLNrgON7h7OIFhx30dq0cZAg2ukOUgWlsNOolD9803OLWNrohcsWdwAtb0cFG/pub?gid=0&single=true&output=csv", nil, nil, [. mimetype="text/plain", autoredirect=true .])) ? "";
every line in Str_Split(SynP, "\n") do
    var cols = [];
    every col in Str_Split(line, ",") do
        var cell = Str_Trim(Wub_ReplaceAll(col, `"`, ""));
        if cell != "" then
            cols = cols + [cell]
        end
    end;
    // Rows with a tag but no synonym are ignored.
    if Size(cols) > 1 then
        var syn = "";
        every s in Rest(cols) do
            syn = syn + " " + s
        end;
        synonyms[Str_ToLowerCase(First(cols))] := Str_Trim(syn)
    end
end;

// ---- Index writer and bookkeeping ----------------------------------------
var writer = Wub_NewLuceneIndex(settings.staging, true, settings.stemmer);

var numDocs = 0;
// Cumulative success flag: true only while the writer exists and every field
// and document add so far has succeeded. Rows that are skipped on purpose do
// not affect it.
var ok = (writer != nil);
// Debug switch: set to false to stop adding documents (e.g. to cap a test run).
var addDebug = true;
var tagSet = [. .];      // tag        -> number of indexed rows using it
var tagSetFr = [. .];    // French tag -> number of indexed rows using it
var authorSet = [. .];   // author     -> number of indexed rows

// Column positions in the tab-separated export.
var URL=0;
var IMG=1;
var CAPTION=2;
var AUTHOR=3;
var TAGS=4;
var SCORE=5;
var FRTAGS=6;
var FRTEXT=7;

// cln3: expand character entities, then keep only letters, digits, space,
// hyphen and apostrophe.
var cln3 = fun(s)
    s = ExpandCharEntities(Str_Trim(s));
    var GOODCHARS = "abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ-'1234567890";
    var i = 0;
    var r = "";
    while i < Size(s) do
        var c = Select(s, i, i+1);
        if (Str_IndexOf(c, GOODCHARS) >= 0) then
            r = r + c
        end;
        i = i + 1
    end;
    return r
end;

// cln2: expand character entities and remove or replace a few unwanted
// sequences ("/>", tabs, backslashes, double quotes, runs of "?").
var cln2 = fun(s)
    s = ExpandCharEntities(Str_Trim(s));
    s = Wub_ReplaceAll(s, "/>", "");
    s = Wub_ReplaceAll(s, "\t", " ");
    s = Wub_ReplaceAll(s, "\\", "");
    s = Wub_ReplaceAll(s, `"`, "'");
    s = Wub_ReplaceAll(s, "???", "");
    s = Wub_ReplaceAll(s, "??", "");
    return s;
end;

// cln: the cleaner applied to every column value.
var cln = fun(s)
    return cln2(s);
end;

// ---- Main loop: one export row -> one Lucene document ----------------------
var trs = Str_Split(Markup(P), "\n");
every tr in trs do
    // NOTE(review): if Str_Split drops empty tokens, an empty column would
    // shift all later columns to the left - confirm against the export format.
    var tds = Str_Split(tr, "\t");
    if Size(tds) > 5 then
        // The French columns (and any other missing one) fall back to "".
        var fImg = cln(tds[IMG]) ? "";
        var fPageUrl = cln(tds[URL]) ? "";
        var fCaption = cln(tds[CAPTION]) ? "";
        var fAuthor = cln(tds[AUTHOR]) ? "";
        var fTags = cln(tds[TAGS]) ? "";
        var fScore = cln(tds[SCORE]) ? "";
        var fCaptionFr = cln(tds[FRTEXT]) ? "";
        var fTagsFr = cln(tds[FRTAGS]) ? "";

        // Need image, page URL and caption; a page URL of "SiteUrl" is
        // excluded (presumably the header row - verify against the export).
        if (fImg != "") and (fPageUrl != "") and (fCaption != "") and (fPageUrl != "SiteUrl") then
            var doc = Wub_NewLuceneDocument();
            var docOk = (doc != nil);

            // ">= 0" so that a match at position 0 counts too, consistent
            // with every other Str_IndexOf test in this script.
            var isNewYorker = "no";
            if (Str_IndexOf("imgc.allpostersimages.com", fImg) >= 0) then
                isNewYorker = "yes"
            end;

            // Collect the synonyms of every tag, separated by spaces so that
            // the synonym lists of neighbouring tags do not fuse together.
            var syns = "";
            every cat in Str_Split(fTags, ",") do
                var key = Str_ToLowerCase(cln3(cat));
                if key member synonyms then
                    syns = syns + " " + synonyms[key]
                end
            end;

            // The tags are repeated three times in the combined search text.
            var allFields = fTags + " " + fTags + " " + fTags + " " + fCaption + " " + fAuthor + " " + cln3(syns);
            allFields = Str_Trim(allFields);
            var allFieldsFr = fTagsFr + " " + fTagsFr + " " + fTagsFr + " " + fCaptionFr + " " + fAuthor + " " + cln3(syns);
            allFieldsFr = Str_Trim(allFieldsFr);

            // Field table: [name, value, 4th argument, 5th argument, always?].
            // Entries with always? == false are only added when non-empty.
            var fieldSpecs = [
                ["pageUrl",   fPageUrl,    true,  false, true],
                ["imgUrl",    fImg,        true,  false, true],
                ["text",      fCaption,    true,  true,  false],
                ["textfr",    fCaptionFr,  true,  true,  false],
                ["author",    fAuthor,     true,  true,  false],
                ["tags",      fTags,       true,  true,  false],
                ["tagsfr",    fTagsFr,     true,  true,  false],
                ["handScore", fScore,      true,  true,  false],
                ["search",    allFields,   false, true,  true],
                ["searchFr",  allFieldsFr, false, true,  true],
                ["newYorker", isNewYorker, true,  true,  true]
            ];
            every spec in fieldSpecs do
                if (doc != nil) and (spec[4] or (spec[1] != "")) then
                    // Evaluate first, then combine, so that one failed field
                    // does not short-circuit the remaining fields.
                    var fieldAdded = Wub_AddLuceneField(doc, spec[0], spec[1], spec[2], spec[3], 0);
                    docOk = docOk and fieldAdded
                end
            end;

            var shouldAdd = true;
            if (fScore != "") then
                // An unparsable score is treated as 2.0.
                var f = (ToReal(fScore) ? 2.0);
                // Score = -1 has special meaning: don't add
                if (f == -1.0) then
                    shouldAdd = false
                end;
                if (f > 1.0) then
                    var lowTags = Str_ToLowerCase(fTags);
                    if (Str_IndexOf("gcoat", lowTags) >= 0) then
                        f = f + 90.0
                    elsif (Str_IndexOf("fcoat", lowTags) >= 0) then
                        f = f + 70.0
                    elsif (Str_IndexOf("ecoat", lowTags) >= 0) then
                        f = f + 50.0
                    end;
                    // Will use post ranking by handScore instead
                    // NOTE(review): the single-line source is ambiguous about
                    // whether this call was commented out; kept active because
                    // the boost arithmetic above would otherwise be unused.
                    if (doc != nil) then
                        doc.setBoost(f)
                    end
                end
            end;

            if shouldAdd and addDebug then
                var added = (doc != nil) and (writer != nil) and Wub_AddLuceneDocument(writer, doc);
                ok = ok and docOk and added;

                every tag in Str_Split(fTags, ",") do
                    var tagName = Str_Trim(cln3(tag));
                    // Skip tags that are empty after cleaning.
                    if tagName != "" then
                        var num = tagSet[ tagName ] ? 0;
                        tagSet[ tagName ] := num + 1
                    end
                end;
                every tag in Str_Split(fTagsFr, ",") do
                    var tagNameFr = Str_Trim(cln3(tag));
                    if tagNameFr != "" then
                        var numFr = tagSetFr[ tagNameFr ] ? 0;
                        tagSetFr[ tagNameFr ] := numFr + 1
                    end
                end;
                if (fAuthor != "") then
                    var numAuthor = authorSet[ fAuthor ] ? 0;
                    authorSet[ fAuthor ] := numAuthor + 1
                end;
                numDocs = numDocs + 1
            end
        end
    end
end;

// ---- Finish --------------------------------------------------------------
// Optimize only when everything succeeded; always close an open writer.
ok = ok and Wub_OptimizeLucene(writer);
if (writer != nil) then
    Wub_CloseLuceneIndex(writer);
end;

// Persist the usage counts alongside the other settings.
settings.tagSet := tagSet;
settings.tagSetFr := tagSetFr;
settings.authorSet := authorSet;
fi.exec := ToString(settings);
Wub_SaveFunctionInfo(fi);
WubCall("adminConsole", ["Index created on STAGING with " + ToString(numDocs) + " entries!"]);