Collaborama

Function name: scrapeRobsCategories
Arguments:
Description: Extract all the Perfect Cartoons on Rob's Google Doc pages
Page type: webl
Render function:
Module: perfectCartoon

Page source:

// Cartoons collected so far, keyed by product-page URL.
var cartoons = [. .];

// The Google Doc index pages to scrape.
var urls = ["https://docs.google.com/document/d/1MCr7ZIj03ae3Jq3D0CixGTfYPv7mpsFVUrxqVLG2iBQ/pub"];

// Scrape a harrybliss.com product page: the caption is the product image's
// alt text, the image size is fixed at 350x350, and the author is always Harry Bliss.
var getBlissAttributes = fun(P, info)
    var txt = nil;
    every p in Elem(P, "p") do
        var c = p.class ? nil;
        if (c == "product-image") then
            var Img = Elem(P, "img") inside p;
            if Size(Img) == 1 then
                txt = Str_Trim(ExpandCharEntities(Img[0].alt));
                info.text := txt;
                var tags = WubCall("generateCategories", [txt]);
                if tags != [] then
                    info.cats := ToList(ToSet(info.cats + tags))
                end;
                info.imgUrl := Img[0].src;
                info.imgHeight := 350;
                info.imgWidth := 350;
            end;
            info.author := "Harry Bliss";
            return info
        end
    end;
    return info
end;

// Scrape a Conde Nast Collection product page: the caption comes from the
// productHeading h1, the artist from the artistName span, and the image
// from the productImage div.
var getNastAttributes = fun(P, info)
    var txt = nil;
    every d in Elem(P, "div") do
        var c = d.id ? nil;
        if (c == "productHeading") then
            var H1 = Elem(P, "h1") inside d;
            if Size(H1) == 1 then
                txt = Str_Trim(ExpandCharEntities(Text(H1[0])));
            end;
            every span in (Elem(P, "span") inside d) do
                var c = span.id ? nil;
                if (c == "artistName") then
                    info.author := Str_Trim(Text(span))
                end
            end
        elsif (c == "productImage") then
            var Img = Elem(P, "img") inside d;
            if Size(Img) == 1 then
                info.imgUrl := Img[0].src;
                info.imgHeight := Img[0].height ? nil;
                info.imgWidth := Img[0].width ? nil;
            end;
        end
    end;
    if (txt != nil) then
        info.text := txt;
        var tags = WubCall("generateCategories", [txt]);
        if tags != [] then
            info.cats := ToList(ToSet(info.cats + tags))
        end
    end;
    return info
end;

// Pull the bit.ly target out of a Google redirect URL (the q= query
// parameter), dropping any trailing parameters.
var extractBitly = fun(u)
    u = Url_Decode(u);
    var i = Str_IndexOf("q=http://bit.ly", u);
    if (i >= 0) then
        u = Select(u, i+2, Size(u));
        i = Str_IndexOf(`&`, u);
        if (i > 0) then
            u = Select(u, 0, i)
        end
    end;
    return u;
end;

// Fetch the page behind a link and dispatch to the matching per-site scraper.
var getExtendedAttributes = fun(url, info)
    var P = GetURL(url) ? nil;
    if (P != nil) then
        if Str_IndexOf("Nast Collection", Markup(P)) > 0 then
            return getNastAttributes(P, info)
        elsif Str_IndexOf("harrybliss.com", Markup(P)) > 0 then
            return getBlissAttributes(P, info)
        end
    end;
    return info
end;

// Process one category page: the first link after each cartoon image points
// at its product page (via a bit.ly redirect); merge this chapter's
// categories into that cartoon's record.
var processCategory = fun(chapterCats, url)
    var P = GetURL(url);
    every img in Elem(P, "img") do
        var cats = chapterCats;
        // if Size(ToList(cartoons)) > 20 then
        //     return
        // end;
        var A = Elem(P, "a") after img;
        var a = A[0].href ? nil;
        if (a != nil) then
            a = extractBitly(a);
            var info = cartoons[a] ? [. imgUrl = nil, cats = [], text = nil, author = nil .];
            var oldCats = info.cats;
            cats = ToList(ToSet(oldCats + cats));
            info.cats := cats;
            // Don't get extended attributes twice. Just add categories above.
            if (info.imgUrl == nil) then
                cartoons[a] := getExtendedAttributes(a, info)
            else
                cartoons[a] := info
            end
        end
    end;
end;

// Links on the index page whose text contains "-Pub" name category pages;
// the text before "-Pub" is split into category tags.
var processCategories = fun(P)
    every a in Elem(P, "a") do
        var s = Str_Trim(Text(a));
        // if Size(ToList(cartoons)) > 20 then
        //     return
        // end;
        var i = Str_IndexOf("-Pub", s);
        if (i > 0) then
            var cats = Str_Split(ExpandCharEntities(Select(s, 0, i)), ",&/-");
            processCategory(cats, a.href);
        end
    end
end;

// Make a value safe for a tab-separated field: nil becomes "", embedded tabs become spaces.
var c = fun(s)
    if s == nil then
        return ""
    else
        return Str_Trim(Wub_ReplaceAll(ToString(s), "\t", " "))
    end
end;

// Join a category list into a single comma-separated string.
var cats = fun(lst)
    var s = "";
    every cat in lst do
        s = s + Str_Trim(ToString(cat)) + ", "
    end;
    if s != "" then
        s = Select(s, 0, Size(s) - 2)
    end;
    return Str_Trim(s)
end;

var main = fun()
    every catUrl in urls do
        var P = GetURL(catUrl);
        processCategories(P)
    end
end;

main();

// Rebuild the "cartooninfo" data set: one tab-separated row per cartoon.
Wub_DeleteData("cartooninfo");
every url in cartoons do
    var data = c(url) + "\t" + c(cartoons[url].imgUrl) + "\t" + c(cartoons[url].text) + "\t" + c(cartoons[url].text) + "\t" + c(cartoons[url].author) + "\t" + cats(cartoons[url].cats) + "\t2.0\n";
    Wub_AppendData("cartooninfo", data);
end;
Wub_ReadData("cartooninfo");
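Each row appended to "cartooninfo" has seven tab-separated fields: the product-page URL, the image URL, the caption text (written twice), the author, a comma-separated category list, and the version string "2.0". The sketch below is a hypothetical reader for that data, not part of the scraper above; it assumes Wub_ReadData returns the appended text verbatim and that Str_Split treats each character of its second argument as a separator and keeps empty fields, as the ",&/-" call above suggests.

var readCartoonInfo = fun()
    var result = [];
    every row in Str_Split(Wub_ReadData("cartooninfo"), "\n") do
        // The trailing "\n" of the last record leaves one empty row; skip it.
        if row != "" then
            var f = Str_Split(row, "\t");
            // Field order matches the Wub_AppendData call above:
            // url, imgUrl, text, text again, author, categories, version.
            result = result + [[. url = f[0], imgUrl = f[1], text = f[2], author = f[4], cats = f[5] .]]
        end
    end;
    return result
end;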