// Collaborama wiki page (site navigation: About | Help | Contact | Anonymous [login])
// Source: site.view [edit]
// Function name: scrapeCartoons
// Arguments: (none)
// Description: Construct a sample dataset of cartoons by scraping CondenastStore.
//              Try to guess categories by using keyword spotting.
// Page type: webl
// Render function: (none)
// Module: perfectCartoon
// Page source:

// GetAuthor: find the artist name that belongs to one thumbnail image.
//   P       - the parsed gallery page
//   img     - the thumbnail <img> piece whose author is wanted
//   nextImg - an <img> piece that bounds the search, or nil to search to the
//             end of the page
// Returns the trimmed text of the first <a> located after img (and before
// nextImg when given) whose class attribute contains "artistName", or the
// string "NoAuthor" when no such anchor exists.
var GetAuthor = fun(P, img, nextImg)
    var A;
    if (nextImg != nil) then
        A = Elem(P, "a") after img before nextImg
    else
        A = Elem(P, "a") after img
    end;
    every a in A do
        // Anchors without a class attribute yield nil here.
        var c = a.class ? nil;
        if (c != nil) then
            // >= 0 so that a class that *starts* with "artistName" (index 0)
            // is accepted too; assumes Str_IndexOf yields -1 when the pattern
            // is absent - TODO confirm against the WebL Str module docs.
            if (Str_IndexOf("artistName", c) >= 0) then
                return Str_Trim(Text(a))
            end
        end
    end;
    return "NoAuthor"
end;

// Scraper configuration.
var galleryUrl = "http://www.condenaststore.com/gallery.asp?cat=146230&c=c&title=Cartoons-by-Artist-Prints&cid=02EC0207D6FA432F9D56CDB3000B4C5E&isAjax=true&_=1371317483994&startat=/GetThumb.asp&Search=146230&page=";
var pageLimit = 30;   // pages 1 .. pageLimit-1 are fetched
// Number of <img> elements to look ahead when bounding the author search.
// NOTE(review): presumably the spacing between consecutive thumbnails in the
// gallery markup - confirm against the live page.
var lookAhead = 6;

var res = [];      // one record per cartoon found
var pageNum = 1;
var tsv = "";      // tab-separated dump: pageUrl, imgUrl, title, text, author, tags

while pageNum < pageLimit do
    var P = GetURL(galleryUrl + ToString(pageNum));
    var Imgs = Elem(P, "img");
    var i = 0;
    // Visit every image, including the last one on the page.
    while (i < Size(Imgs)) do
        var img = Imgs[i];
        var nextImg = nil;
        if (i + lookAhead < Size(Imgs)) then
            nextImg = Imgs[i + lookAhead]
        end;
        var c = img.class ? nil;
        // Only images carrying class "thmbd" are cartoon thumbnails.
        if (c == "thmbd") then
            var a = Parent(img);
            var pgUrl = a.href;
            var label = ExpandCharEntities(img.alt);
            res = res + [ [. imgUrl=img.src, pageUrl=pgUrl, title=label, text=label .] ];
            var tags = WubCall("generateCategories", label);
            var author = GetAuthor(P, img, nextImg);
            tsv = tsv + pgUrl + "\t" + img.src + "\t" + label + "\t" + label + "\t" + author + "\t" + tags + "\n"
        end;
        i = i + 1
    end;
    pageNum = pageNum + 1
end;

// Persist the TSV as the body of perfectCartoon.scrapedCartoons.
var fi = Wub_GetFunctionInfo("perfectCartoon.scrapedCartoons");
fi.exec = tsv;
Wub_SaveFunctionInfo(fi);

// Script result: number of cartoons collected.
Size(res);