Using Groovy to crawl Wiki and extract Country data

This script has taken a while longer than anticipated. Crawling Wiki from this base page proved to be a bit of a headache, because of inconsistencies in the way web pages were constructed.
Needless to say, I eventually came up with a script to help seed my project and it will provide data for some interesting blog posts in the future.
One of the things I wanted to do was save the flags, so if I present a list of blogs in the future, I can place the appropriate flag alongside it in a YUI or Dojo table.
I also wanted to extract country telephone calling codes, ready for more geo-coding steps ahead.
A few links I found useful when creating this script are:

Here’s the script for crawlWikiCountries.groovy

package jgf
import groovy.grape.Grape
import com.thoughtworks.selenium.*
import javax.imageio.ImageIO
import java.awt.image.BufferedImage
@Grapes([
    @Grab(group='net.sourceforge.nekohtml', module='nekohtml', version='1.9.14'),
    @Grab(group='xerces', module='xercesImpl', version='2.9.1'),
    @Grab(group='org.seleniumhq.selenium.client-drivers', module='selenium-java-client-driver', version='1.0.1') ])

class CrawlWikiCountries extends GroovySeleneseTestCase {

  /**
   * Starts the Selenium session used by the crawl.
   *
   * Points the browser at English Wikipedia using the '*chrome' launcher,
   * sets the default timeout to 50000 and switches off screenshot capture
   * on failure.
   */
  @Override
  void setUp() throws Exception {
    def baseUrl  = 'http://en.wikipedia.org'
    def launcher = '*chrome'
    super.setUp(baseUrl, launcher)
    setDefaultTimeout(50000)
    setCaptureScreenshotOnFailure(false)
  }

  /**
   * Test entry point: runs the whole crawl as a fixed sequence of stages,
   * all sharing one state map created by init().
   */
  void testCrawlWikiCountries() throws Exception {
    def state = init()
    extractCountries(state)        // ISO 3166-1 index page -> basic country records
    extractCountryDetails(state)   // one page visit per country
    dumpResultsToDisk(state)       // XML file plus images
    reportErrors(state)            // list countries whose XML contained 'null'
  }

  /**
   * Builds the shared crawl state map.
   *
   * Every assignment inside the crawl.with block without a 'def' becomes a key
   * on the map (h, fs, nl, d, gsd, the output locations, encoding, countries,
   * errors). Output locations are read from
   * "<home>/Desktop/Groovy Scripts/crawl.properties" via ConfigSlurper and are
   * prefixed with the Desktop directory.
   *
   * @return the populated crawl map
   */
  def init() {
    def crawl = [:]
    crawl.with {
      // HOME is not set on every platform (e.g. Windows); fall back to the JVM's user.home
      h                               = System.getenv('HOME') ?: System.getProperty('user.home')
      fs                              = System.getProperty('file.separator') // Java Sys Property
      nl                              = System.getProperty("line.separator") // Newline character
      d                               = "${h}${fs}Desktop"
      gsd                             = "${d}${fs}Groovy Scripts"
      // toURI().toURL() escapes the space in "Groovy Scripts"; File.toURL() is deprecated and does not
      def props                       = new ConfigSlurper().parse(new File("${gsd}${fs}crawl.properties").toURI().toURL())
      countriesWikiFile               = "${d}${props.countriesFileWiki}"
      countriesSmallFlagImageDir      = "${d}${props.countriesSmallFlagImageDir}"
      countriesMediumFlagImageDir     = "${d}${props.countriesMediumFlagImageDir}"
      countriesLargeFlagImageDir      = "${d}${props.countriesLargeFlagImageDir}"
      countriesGlobalLocationImageDir = "${d}${props.countriesGlobalLocationImageDir}"
      countriesEmblemImageDir         = "${d}${props.countriesEmblemImageDir}"
      encoding                        = 'UTF-8'
      countries                       = []   // filled by extractCountries
      errors                          = []   // filled by dumpResultsToDisk
    }
    println ''
    return crawl
  }

  /**
   * Reads the ISO 3166-1 page and appends one record per table row to
   * crawl.countries.
   *
   * Each record carries the small flag image, the country link and title, the
   * alpha-2 / alpha-3 / numeric codes and the ISO 3166-2 link, plus empty
   * placeholders that the per-country crawl fills in later.
   *
   * @param crawl shared crawl state; crawl.countries is appended to
   * @return null
   */
  def extractCountries(crawl) {
    selenium.open("http://en.wikipedia.org/wiki/ISO_3166-1")
    def isoPage  = getNekoHtml(crawl)
    def isoTable = isoPage.depthFirst().TABLE.find { it.'@id' == 'sortable_table_id_0' }
    isoTable.TBODY.TR.eachWithIndex { row, rowIndex ->
      // Row zero is the table header - nothing to extract.
      if (rowIndex == 0) {
        return
      }
      /*
       * Row filters that were swapped in for the header test while debugging:
       *   (1..4).contains(i)
       *   [1,2,9,30,45,130,232,233].contains(i)
       *   [1,2,9,30,45,87,95,130,155,166,169,180,210,215,232,233,234,243,244].contains(i)
       *   [87,95,155,166,169,180,210,215,243].contains(i)
       *   [87,155,166,180,243].contains(i)
       *
       * Rows that needed individual attention (all tested):
       *   1 Afghanistan
       *   2 Åland Islands
       *   9 Antarctica                            No vcard
       *  30 Bouvet Island
       *  45 People's Republic of China
       *  87 Guadeloupe
       *  95 Heard Island and McDonald Islands     No vcard
       * 130 Republic of Macedonia
       * 155 Netherlands
       *     Awkward one: two tables with the same vcard class, plus Amsterdam/Hague autonumber links.
       * 166 Oman
       *     Inconsistent naming on the Flag href with embedded image.
       * 169 Palestinian territories               No vcard
       * 180 Réunion
       *     Was having problems with location.
       * 210 Svalbard and Jan Mayen                No vcard
       *     Really two places (Norwegian).
       * 215 Taiwan                                No vcard
       * 232 United Kingdom
       * 233 United States
       * 234 United States Minor Outlying Islands  No vcard
       * 243 Western Sahara
       * 244 Yemen
       *     Saving the image failed while ':' was URL-encoded to 3A. Fixed.
       */
      def cells   = row.depthFirst().findAll { it.name() == 'TD' }
      def anchors = row.depthFirst().findAll { it.name() == 'A' }
      def country = [
        flag        : [smallImg: cells[0].depthFirst().SPAN.find { it.'@class' == 'flagicon' }.IMG.'@src'[0]],
        countryUrl  : anchors[0].'@href',   // relative url /wiki
        name        : anchors[0].'@title',
        alpha2      : cells[1].find { it.name() == 'TT' }.text(),
        alpha3      : cells[2].find { it.name() == 'TT' }.text(),
        numeric     : cells[3].find { it.name() == 'TT' }.text(),
        iso3166_2   : anchors[1].'@title',
        iso3166Url  : anchors[1].'@href',   // relative url /wiki
        // Placeholders for the subsequent page crawls
        citiesUrl   : '',
        emblem      : [:],
        loc         : [:],
        capital     : [:],
        ll          : [],
        largestCity : [:],
        callingCode : [:],
        tld         : []
      ]
      crawl.countries << country
    }
    println "Total countries : ${crawl.countries.size()} on first page"
    println ''
    println '---'
    println ''
    return null
  }

  /**
   * Visits every country collected so far and enriches its record.
   *
   * The record being processed is published as crawl.country (and its position
   * as crawl.i) before crawlCountryDetailsPage runs, then written back into the
   * list afterwards.
   *
   * @param crawl shared crawl state
   * @return null
   */
  def extractCountryDetails(crawl) {
    int position = 0
    for (entry in crawl.countries) {
      crawl.country = entry
      println entry.name
      crawl.i = position
      crawlCountryDetailsPage(crawl)
      crawl.countries[position] = crawl.country
      position++
    }
    return null
  }

  /**
   * Opens the Wikipedia page of crawl.country and extracts its details.
   *
   * Steps, in order:
   *  1. Parse the page and store it as crawl.page, then call extractCoOrds.
   *  2. Collect every TABLE whose class is exactly 'infobox geography vcard'.
   *  3. Depending on how many were found, choose crawl.geog_vcard:
   *       0 - per-country special cases, matched on country.name
   *       1 - use it
   *      >1 - use the last table that has a descendant whose text is 'Capital'
   *  4. Copy any flag / flag-history / location anchors found by the special
   *     cases onto the country record.
   *  5. If a vcard was chosen, run processGeogVcard and, when a large flag URL
   *     is known, processLargeFlag. Otherwise only a warning is printed.
   *
   * Inside crawl.with, names assigned without 'def' (page, geog_vcard) become
   * keys on the crawl map; names declared with 'def' are ordinary locals.
   *
   * @param crawl shared crawl state; crawl.country must already be set
   * @return null
   */
  def crawlCountryDetailsPage(crawl) {
    selenium.open("http://en.wikipedia.org$crawl.country.countryUrl")
    crawl.with {
      page       = getNekoHtml(crawl)
      extractCoOrds(crawl)
      def geog_vcards = page.depthFirst().TABLE.findAll{it.'@class' == 'infobox geography vcard'}
      geog_vcard = null
      def gsz = geog_vcards.size()
      // Anchors located by the no-vcard special cases; they stay null otherwise.
      def pageFlagImg
      def pageFlagHist
      def location
      switch (gsz) {
        case 0  : // After first full crawl, also noticed issues with [9,95,169,210,234]
                  // Also has minor hiccups with [87,155,166,180,215,243]
                  println "*** No VCARD: $country.name"
                  switch (country.name) {
                    case 'Antarctica' : // #9
                      // A right-floated DIV stands in for the vcard here.
                      geog_vcard   = page.depthFirst().DIV.find{it.'@style'?.startsWith('float: right')}
                      pageFlagImg  = page.depthFirst().A.find{it.'@href'?.startsWith('/wiki/File:Flag')}
                      pageFlagHist = page.depthFirst().A.find{it.'@href'?.startsWith('/wiki/Flag_of')}
                      location     = page.depthFirst().A.find{it.'@href'?.startsWith('/wiki/File:Location')}
                      break
                    case 'Heard Island and McDonald Islands' : // #95
                      location     = page.depthFirst().A.find{it.'@href' == '/wiki/File:Kerguelen-Location.JPG'}
                      break
                    case 'Palestinian territories' : //  #169
                      location     = page.depthFirst().A.find{it.'@href'?.startsWith('/wiki/File:West_Bank') && it.'@class' == 'image' }
                      break
                    case 'Svalbard and Jan Mayen' : // #210
                      // Can't do much two places!...
                      break
                    case 'Taiwan' : // #215
                      location     = page.depthFirst().A.find{it.'@href'?.startsWith('/wiki/File:Location')}
                      // NOTE(review): cap is dereferenced without a null check; a page without a
                      // link titled 'Taipei' would fail here.
                      def cap      = page.depthFirst().A.find{it.'@title' == 'Taipei'}
                      country.capital.name     = cap.text()
                      country.capital.url      = cap.'@href'
                      // The capital's co-ordinates are copied from the country-level ones set by extractCoOrds.
                      country.capital.latitude  = country.latitude
                      country.capital.longitude = country.longitude
                      country.largestCity.name = country.capital.name
                      country.largestCity.url  = country.capital.url
                      country.flag.smallImg    =  page.depthFirst().SPAN.find{it.'@class' == 'flagicon'}.IMG[0].'@src'

                      break
                    case 'United States Minor Outlying Islands' : // #234
                      // A 'thumb tright' DIV stands in for the vcard; flag links are searched inside it.
                      geog_vcard   = page.depthFirst().DIV.find{it.'@class' == 'thumb tright'}
                      pageFlagImg  = geog_vcard.depthFirst().A.find{it.'@href'?.startsWith('/wiki/File:Flag')}
                      pageFlagHist = geog_vcard.depthFirst().A.find{it.'@href'?.startsWith('/wiki/Flag_of')}
                      location     = page.depthFirst().A.find{it.'@href'?.startsWith('/wiki/File:United') && it.'@class' == 'image' }
                      break
                  } // Switch country name
                  break
        case 1  : geog_vcard = geog_vcards[0]
                  break
        default : // Greater than 1..
                  println "*** Multple VCARDS ($gsz): $country.name"
                  // ind is overwritten on every match, so the last matching table wins.
                  def ind = -1
                  geog_vcards.eachWithIndex{g, i ->
                    if (g.depthFirst().any{g2 -> g2.text() == 'Capital'})
                      ind = i
                  }
                  if (ind != -1)
                    geog_vcard =  geog_vcards[ind]
                  break
      } // switch gsz
      // Apply whatever the special cases found; each anchor wraps an IMG.
      if (pageFlagImg) {
        country.flag.mediumImg   = pageFlagImg.IMG[0].'@src'
        country.flag.largeImgUrl = pageFlagImg.'@href'  // Goes to bigger flag again...
      }
      if (pageFlagHist)
        country.flag.historyUrl  = pageFlagHist.'@href'
      if (location) {
        country.loc.img          = location.IMG[0].'@src'
        country.loc.largeImgUrl  = location.'@href'
      }

      // The large flag page is only visited when a vcard (or stand-in) was chosen.
      if (geog_vcard) {
        processGeogVcard(crawl)
        if (country.flag.largeImgUrl) {
          processLargeFlag(crawl)
        }
      } else {
        println "*** No VCARD: $country.name - not caught!"
      }
    } // crawl.with
    return null
  }

  /**
   * Reads the page-level co-ordinates, if any, into the current country.
   *
   * Looks for a SPAN with id 'coordinates' on crawl.page and, inside it, a SPAN
   * with class 'geo' whose text is split on ';' and space characters.
   * crawl.country.latitude / longitude receive the first two tokens, or empty
   * strings when no such text exists.
   * Some pages carry co-ordinates for the country as a whole, e.g.
   * Antarctica #9, Bouvet Island #30, China #45, Macedonia #130.
   *
   * @param crawl shared crawl state; reads crawl.page, writes crawl.country
   * @return null
   */
  def extractCoOrds(crawl) {
    def holder  = crawl.page.depthFirst().SPAN.find { it.'@id' == 'coordinates' }
    def geoNode = holder?.depthFirst()?.find { it.name() == 'SPAN' && it.'@class' == 'geo' }
    def geoText = geoNode?.text()
    def parts
    if (geoText) {
      parts = geoText.tokenize('; ')
    } else {
      parts = ['', '']
    }
    crawl.country.latitude  = parts[0]
    crawl.country.longitude = parts[1]
    return null
  }

  /**
   * Walks every TR of crawl.geog_vcard and fills in crawl.country: flag and
   * emblem, location map, cities list / largest city, capital (name, url,
   * co-ordinates, city emblem), calling code and TLDs.
   *
   * Inside crawl.country.with, undeclared names (flag, emblem, loc, citiesUrl,
   * largestCity, capital, ll, callingCode, tld) resolve to keys of the country
   * map. The working flags are real locals: flagf / telf / tldf /
   * allocatedLink are reset for every row, while capf and capLargestCity latch
   * for the rest of the vcard once set.
   *
   * Kinds of row encountered:
   *  - Country-wide lat/long sometimes exists outside the vcard, e.g.
   *    Antarctica #9, Bouvet Island #30, China #45, Macedonia #130.
   *    Antarctica is not in a geography vcard at all and has no capital.
   *  - Flag and emblem / coat of arms (anchor with an IMG inside). The US has a
   *    seal instead #233; Bouvet has no coat of arms #30.
   *  - Location (anchor with IMG plus various other anchors). A territories
   *    image may also be present, e.g. UK #232 - ignored for now.
   *  - Capital link, normally followed by the capital city. The city may have
   *    its own coat of arms, e.g. Skopje for Macedonia #130.
   *    "(and largest city)" may follow, as a link to a list of cities
   *    (UK #232, Afghanistan #1) or as plain text (Åland Islands #2).
   *  - Largest city as text only, as a link to the list of cities, or as a
   *    name with a link to the city (USA/China #233/#45). Bermuda #25 has the
   *    largest city as text equal to the capital.
   *  - Capital lat/long: take care not to pick one that is outside the vcard.
   *    Bouvet Island #30 has no Capital row at all.
   *  - Internet TLD: link plus code as link or text; several for the US.
   *  - Calling code: link and text may need combining (Åland Islands #2),
   *    plain text (Afghanistan #1) or a link (UK #232).
   *
   * Test rows: [1,2,9,30,45,130,232,233]; after the first crawl also
   * [9,95,169,210,234] and [87,155,166,180,215,243,244] (244 had an odd issue
   * with the map). Combined:
   * [1,2,9,30,45,87,95,130,155,166,169,180,210,215,232,233,234,243,244].
   *
   * @param crawl shared crawl state; reads crawl.geog_vcard, updates crawl.country
   * @return null
   */
  def processGeogVcard(crawl) {
    def tr = crawl.geog_vcard.depthFirst().findAll{it.name() == 'TR'}
    // Declared explicitly: left undeclared it would be stored as a 'capf' key on the country map.
    def capf = false
    def capLargestCity = false
    crawl.country.with {
      tr.each{trow ->
        def a = trow.depthFirst().findAll{it.name() == 'A' && it.'@class' != 'external autonumber'} // Mod for Amsterdam /Hague
        def flagf = false
        capLargestCity  = (capLargestCity) ?: [ trow.TD[0]?.A?.text() , trow.TD[0]?.text()].any{txt -> txt?.startsWith('(and')}
        def telf  = false
        def tldf  = false
        def allocatedLink = false
        a.eachWithIndex{a1, ai ->
          // Anchors without an href do occur; treat them as an empty link instead of failing on null.
          def ahref  = a1.'@href' ?: ''
          def atitle = a1.'@title'
          def aclass = a1.'@class'
          def aimg    = (aclass) ? aclass == 'image' : false
          // Flag & Emblem
          if (!flag.mediumImg && aimg && atitle?.startsWith('Flag')) { //  ahref.startsWith('/wiki/File:Flag') - Oman breaks this bad naming had to re-work.
            flag.mediumImg   = a1.IMG[0].'@src'
            flag.largeImgUrl = ahref  // Goes to bigger flag again...
            flagf         = true
            allocatedLink = true
          } else if (!flag.historyUrl && atitle?.startsWith('Flag') && ahref.startsWith('/wiki/Flag_of')) {
            flag.historyUrl = ahref // Shows historical versions
            flagf         = true
            allocatedLink = true
          } else {
            // Once a flag link was seen in this row, the remaining links belong to the emblem.
            if (flagf) {
              if (ahref.endsWith('.svg') && !emblem.img) {
                emblem.img         = a1.IMG[0].'@src'
                emblem.largeImgUrl = ahref // goes to bigger
                allocatedLink      = true
              } else if (!emblem.historyUrl) {
                emblem.historyUrl  = ahref // goes to history
                allocatedLink      = true
              }
            }
          }
          // Location
          if (!loc.img && !allocatedLink) {
            if (aimg && atitle?.startsWith('Location') &&
                 (   ahref.indexOf('orthographic_projection') != -1 // eg. Afghanistan
                  || ahref.startsWith('/wiki/File:Location')        // eg. Reunion
                  || ahref.startsWith('/wiki/File:Map')             // e.g.Oman
                  || ahref.startsWith('/wiki/File:EU')              // e.g. UK
                 ) &&  ahref.indexOf('BOTs') == -1                  // Tweak for UK...
               ) {
              loc.img         = a1.IMG[0].'@src'
              loc.largeImgUrl = ahref
              allocatedLink   = true
            }
          }
          // Largest City / Cities
          if (!citiesUrl && ahref.startsWith('/wiki/List_of_cities')) { // UK is different in capital row. Others like US/China outside.
            citiesUrl = ahref
            if (a1.text() == 'Largest city') {
              if (a[ai +1]) {
                largestCity.name = a[ai +1].text()
                largestCity.url  = a[ai +1].'@href'
              } else { // Cater for the likes of quirky Bermuda Wiki page
                largestCity.name = trow.TD[0].text()
                largestCity.url  = ''
              }
            }
          }
          // Capital
          if (!capf && !allocatedLink && ahref.startsWith('/wiki/Capital')) {
            capf = true
          } else if (capf) {
            if (ahref.startsWith('http://toolserver.org')) {
              if (ll.size() == 0) {
                def geo = a1.depthFirst().find{it.name() == 'SPAN' && it.'@class' == 'geo'}?.text()
                ll  = (geo) ? geo.tokenize('; ') : ['','']
                capital.latitude  = ll[0]
                capital.longitude = ll[1]
              }
              if (!capital.name) {
                // The city link is taken to be the anchor just before the co-ordinates link.
                capital.name = a[ai -1].text()
                capital.url  = a[ai -1].'@href'
                if (capLargestCity) {
                  largestCity.name = capital.name
                  largestCity.url  = capital.url
                }
              }
            } else if (ahref.startsWith('/wiki/File:')) {
              capital.emblemImg = a1.IMG[0].'@src'
              capital.emblemUrl = ahref
            }
          }
          // Calling code
          if (!telf && !callingCode.text && !allocatedLink && ahref.startsWith('/wiki/List_of_country_calling_codes')) {
             telf = true
          }
          if (telf && !callingCode.text) {
            // The value is read from the row's second child, optionally prefixed by its first child's text.
            def child1      = trow.children()[1]
            def child10     = (child1) ? child1?.children()[0] : null
            def child10Text = (!child10 || child10.class.simpleName  == 'String') ? null : child10?.text()
            if (child10Text) {
              callingCode.text = "$child10Text ${child1?.text()}".trim()
              callingCode.url  = child10.'@href'
            } else if (child1) {
              callingCode.text = child1.text()
              if (child1.'@href') {
                callingCode.url  = child1.'@href'
              }
            }
          }
          // Tld
          if (!allocatedLink && !tldf && !tld && ahref.startsWith('/wiki/Country_code_top-level_domain')) {
            tldf = true
          } else if (tldf) {
            if (ahref.startsWith('/wiki/.')) {
              def t = [:]
              t.text = a1.text()
              t.url = ahref
              tld << t
            }
          }
        } // a each
      } // tr each
    } // crawl country with
    return null
  }

  /**
   * Opens the file page behind the country's large flag link and records the
   * image shown there as crawl.country.flag.largeImg.
   *
   * The image is taken from the first IMG inside the first anchor of the DIV
   * whose id is 'file'.
   *
   * @param crawl shared crawl state; crawl.country.flag.largeImgUrl must be set
   * @return null
   */
  def processLargeFlag(crawl) {
    def target = "http://en.wikipedia.org${crawl.country.flag.largeImgUrl}".toString()
    selenium.open(target)
    def filePage    = getNekoHtml(crawl)
    def holder      = filePage.depthFirst().DIV.find { it.'@id' == 'file' }
    def firstAnchor = holder.A[0]   // the anchor wraps the IMG
    crawl.country.flag.largeImg = firstAnchor.IMG[0].'@src'
    return null
  }

  /**
   * Parses the HTML currently loaded in the Selenium browser into a node tree.
   *
   * Uses the NekoHTML SAX parser with the SAX namespaces feature turned off.
   *
   * @param crawl shared crawl state (not used here)
   * @return root node of the parsed page
   */
  def getNekoHtml(crawl) {
    def saxParser = new org.cyberneko.html.parsers.SAXParser()
    saxParser.setFeature('http://xml.org/sax/features/namespaces', false)
    return new XmlParser(saxParser).parseText(selenium.getHtmlSource())
  }

  /**
   * Writes all crawled countries to the XML file named by
   * crawl.countriesWikiFile and then saves each country's images.
   *
   * The file is (re)created with a <countries> root, one <country> element is
   * appended per record, and the root is closed at the end. Any country whose
   * XML fragment contains the text 'null' is added to crawl.errors.
   * Values are written as-is; no XML escaping is applied.
   *
   * @param crawl shared crawl state
   * @return null
   */
  def dumpResultsToDisk(crawl) {
    crawl.with {
      def sz = countries.size()
      println ''
      println '==='
      println "Writing $sz countries to disk: $countriesWikiFile"
      def xmlFile = new File(countriesWikiFile)
      xmlFile.write("<countries>$nl", encoding)
      countries.eachWithIndex{country, i ->
        // Progress line every 20 records; the country name is printed for every record.
        if (i.mod(20) == 0) {
          println "Writing ${i+1} of $sz record(s)"
        }
        println country.name
        def xml  = "  <country id='${i+1}'>$nl"
            xml += "    <name>$country.name</name>$nl"
            xml += "    <alpha2>$country.alpha2</alpha2>$nl"
            xml += "    <alpha3>$country.alpha3</alpha3>$nl"
            xml += "    <iso-number>$country.numeric</iso-number>$nl"
            xml += "    <iso3166-2>$country.iso3166_2</iso3166-2>$nl"
            xml += "    <lat-long>$nl"
            xml += "      <latitude>$country.latitude</latitude>$nl"
            xml += "      <longitude>$country.longitude</longitude>$nl"
            xml += "    </lat-long>$nl"
            xml += "    <location>$nl"
            xml += "      <image>${(country.loc.img)?:''}</image>$nl"
            xml += "      <large-image-url>${(country.loc.largeImgUrl)?:''}</large-image-url>$nl"
            xml += "    </location>$nl"
            xml += "    <capital>$nl"
            xml += "      <name>${(country.capital.name)?:''}</name>$nl"
            xml += "      <lat-long>$nl"
            xml += "        <latitude>${(country.capital.latitude)?:''}</latitude>$nl"
            xml += "        <longitude>${(country.capital.longitude)?:''}</longitude>$nl"
            xml += "      </lat-long>$nl"
            xml += "      <url>${(country.capital.url)?:''}</url>$nl"
            xml += "      <emblem-img>${(country.capital.emblemImg)?:''}</emblem-img>$nl"
            xml += "      <emblem-url>${(country.capital.emblemUrl)?:''}</emblem-url>$nl"
            xml += "    </capital>$nl"
            xml += "    <largest-city>$nl"
            // Fixed: the closing tag was written as a second opening <name>
            xml += "      <name>${(country.largestCity.name)?:''}</name>$nl"
            xml += "      <url>${(country.largestCity.url)?:''}</url>$nl"
            xml += "    </largest-city>$nl"
            xml += "    <urls>$nl"
            xml += "      <country-url>$country.countryUrl</country-url>$nl"
            xml += "      <iso3166-url>$country.iso3166Url</iso3166-url>$nl"
            xml += "      <cities-url>$country.citiesUrl</cities-url>$nl"
            xml += "    </urls>$nl"
            xml += "    <flags>$nl"
            xml += "      <images>$nl"
            xml += "        <small>${(country.flag.smallImg)?:''}</small>$nl"
            xml += "        <medium>${(country.flag.mediumImg)?:''}</medium>$nl"
            xml += "        <large>${(country.flag.largeImg)?:''}</large>$nl"
            // Fixed: <images> was re-opened here instead of being closed
            xml += "      </images>$nl"
            xml += "      <large-img-url>${(country.flag.largeImgUrl)?:''}</large-img-url>$nl"
            xml += "      <history-url>${(country.flag.historyUrl)?:''}</history-url>$nl"
            xml += "    </flags>$nl"
            xml += "    <emblem>$nl"
            xml += "      <image>${(country.emblem.img)?:''}</image>$nl"
            xml += "      <large-image-url>${(country.emblem.largeImgUrl)?:''}</large-image-url>$nl"
            xml += "      <history-url>${(country.emblem.historyUrl)?:''}</history-url>$nl"
            xml += "    </emblem>$nl"
            xml += "    <callingCode>$nl"
            xml += "      <value>${(country.callingCode.text)?:''}</value>$nl"
            xml += "      <url>${(country.callingCode.url)?:''}</url>$nl"
            xml += "    </callingCode>$nl"
            xml += "    <tlds>$nl"
        country.tld.each{t ->
            xml += "      <tld>$nl"
            xml += "        <text>$t.text</text>$nl"
            xml += "        <url>$t.url</url>$nl"
            xml += "      </tld>$nl"
        }
            xml += "    </tlds>$nl"
            xml += "  </country>$nl"
        xmlFile.append(xml, encoding)
        // A literal 'null' in the fragment means some un-defaulted field was never filled in.
        if (xml.contains('null')) {
          println "Null in: $country.name"
          errors << country.name
        }
        crawl.country = country
        writeImagesToDisk(crawl)
      }
      xmlFile.append("</countries>$nl", encoding)
    }
    return null
  }

  /**
   * Saves the images known for crawl.country (small / medium / large flag,
   * location map, emblem) as PNG files, one per configured image directory.
   *
   * Each file is named after the URL-encoded country name plus '.png'. The
   * name is always encoded as UTF-8 so that the result does not depend on the
   * platform default charset. Images without a URL are skipped.
   *
   * @param crawl shared crawl state; reads crawl.country, crawl.fs and the image directories
   * @return null
   */
  def writeImagesToDisk(crawl) {
    crawl.with {
      // The single-argument URLEncoder.encode is deprecated and uses the platform charset
      def cenc = URLEncoder.encode("${country.name}.png".toString(), 'UTF-8')

      def targets = [
        [urlStr: country.flag.smallImg,  filename: "$countriesSmallFlagImageDir$fs$cenc"],
        [urlStr: country.flag.mediumImg, filename: "$countriesMediumFlagImageDir$fs$cenc"],
        [urlStr: country.flag.largeImg,  filename: "$countriesLargeFlagImageDir$fs$cenc"],
        [urlStr: country.loc.img,        filename: "$countriesGlobalLocationImageDir$fs$cenc"],
        [urlStr: country.emblem.img,     filename: "$countriesEmblemImageDir$fs$cenc"]
      ]

      targets.each{img ->
        if (img.urlStr) {
          def url = img.urlStr.toURL()
          def fi  = new File(img.filename as String)
          writeImageToDisk(url, fi)
        }
      }
    }
    return null
  }

  /**
   * Downloads one image and stores it as a PNG file.
   *
   * Failures are reported on stdout and otherwise ignored, so a single bad
   * image does not stop the run.
   *
   * @param u location of the source image
   * @param f destination file
   * @return null
   */
  def writeImageToDisk(URL u, File f) {
    try {
      def img = ImageIO.read(u)
      if (img == null) {
        // ImageIO.read returns null when no registered reader can decode the data;
        // passing that to ImageIO.write would throw IllegalArgumentException.
        println "No image reader available | $u"
      } else {
        ImageIO.write(img, "png", f)
      }
    } catch (IOException e) {
      println "$e | $u"
    }
    return null
  }

  /**
   * Prints the closing summary: either the names collected in crawl.errors
   * (countries whose XML contained 'null') or a note that there were none.
   *
   * @param crawl shared crawl state; reads crawl.errors
   * @return null
   */
  def reportErrors(crawl) {
    println ''
    println '==='
    def failed = crawl.errors
    if (!failed) {
      println 'No null data in XML'
      return null
    }
    println 'Null data in'
    for (name in failed) {
      println name
    }
    return null
  }

}

Here are the configuration property entries that ConfigSlurper uses:

crawl.properties (for crawlWikiCountries.groovy)

Here’s the output

CountriesWiki xml UK sample extract

Images folders collapsed

ImagesFolder Expanded flags

Here are snippets from the output created during the crawl:

Summary of salient events during stages of crawl from start to finish

Related posts:

Advertisements

About this entry