Groovy script to extract Country calling code information from Wiki page

This time I took the List of country calling codes page off Wiki and extracted data from it.

A couple of extra entries went into my usual crawl.properties file

crawl.properties

Here’s the script.

In an earlier simplified post I touched upon how I was able to use a closure to refactor code.

That manifested itself in a couple of places in this script.

  • Lines 248-271 with ‘getNodeAndChildNodeInfo’
  • Lines 642-656 with ‘processImageForCountry’ (having the closest correlation to the earlier post).

Because I couldn’t find a NextSibling option, there’s some quirky DOM navigation going on here to extract the In Depth Zones 1-9 entries.

Basically I use the id from the zones data structure (lines 46-135), to navigate to a SPAN with this class, then go back to parent and back down again to children after it.

To keep the code simple I only extracted data if there was a dialing code link, a flag and a country link on the line.

So lines with two flags, like the one for code +290, are skipped.

In Depth snippet

Wiki gives all links (A tags) without a proper connecting page a class of “new”. I ignore the link in these cases, as it’s normally an edit page for someone to start the ball rolling.

There are also some discrepancies between different parts of the page in the names of countries, hence the use of countryAliases on lines 137-191 to normalise country names.

I basically look for a combination of calling code and country name to make sure you don’t add a duplicate entry.

The urls list within each zone map entry is a useful grouping mechanism for geographically collating countries.

That’s about it. Enjoy.

PS: If there’s any way I can improve my code, comments welcome as always.

Here’s the code for crawlWikiCallingCodes.groovy.

package jgf
import groovy.grape.Grape
import com.thoughtworks.selenium.*
import javax.imageio.ImageIO
import java.awt.image.BufferedImage
@Grapes([
    @Grab(group='net.sourceforge.nekohtml', module='nekohtml', version='1.9.14'),
    @Grab(group='xerces', module='xercesImpl', version='2.9.1'),
    @Grab(group='org.seleniumhq.selenium.client-drivers', module='selenium-java-client-driver', version='1.0.1') ])

class CrawlWikiCallingCodes extends GroovySeleneseTestCase {

  @Override
  void setUp() throws Exception {
    // Point the Selenium session at Wikipedia, driven through Chrome.
    super.setUp('http://en.wikipedia.org', '*chrome')
    setDefaultTimeout(50000)               // generous timeout: the page is large
    setCaptureScreenshotOnFailure(false)
  }

  void testCrawlWikiCallingCodes() throws Exception {
    // Pipeline: build the shared crawl map, run the extraction passes,
    // then report, persist the XML, and fetch the flag images.
    def crawl = init()
    extractAtAGlance(crawl)        // pass 'aag' : the 'At a glance' wikitable
    extractZones1to9(crawl)        // pass '1-9' : the in-depth zone listings
    extractBases(crawl)            // locations with no country code
    extractCompleteListing(crawl)  // pass 'comp': the complete listing table
    printResults(crawl)
    dumpResultsToDisk(crawl)
    reportErrors(crawl)
    writeImagesToDisk(crawl)
  }

  def init() {
    // Open the Wiki page and build the shared 'crawl' map that every other
    // phase reads from and writes to: parsed DOM, zone metadata, country-name
    // aliases, error accumulator, and output paths.
    selenium.open("http://en.wikipedia.org/wiki/List_of_country_calling_codes")
    def crawl = [:]
    crawl.with {
      // Every bare assignment inside this with{} creates an entry on the crawl map.
      page                            = getNekoHtml(crawl)
      h                               = System.getenv('HOME')                // OS Shell var
      fs                              = System.getProperty('file.separator') // Java Sys Property
      nl                              = System.getProperty("line.separator") // Newline character
      d                               = "${h}${fs}Desktop"
      gsd                             = "${d}${fs}Groovy Scripts"
      // NOTE(review): File.toURL() is deprecated (does no escaping); toURI().toURL() is safer.
      def props                       = new ConfigSlurper().parse(new File("${gsd}${fs}crawl.properties").toURL())
      encoding                        = 'UTF-8'
      // One entry per world numbering zone. 'id' is the SPAN id of the zone's
      // heading on the page (zone '4' has a null id because it shares the
      // 'Zones 3/4' heading with zone '3' — see extractZones1to9). 'urls' are
      // related reference links written out with each zone's XML header.
      zones                           = [[zone:  '1',
                                          id:    'Zone_1_.E2.80.93_North_American_Numbering_Plan_Area',
                                          name:  'North American Numbering Plan Area',
                                          urls: [[name:'List of North American Numbering Plan area codes',
                                                  link:'http://en.wikipedia.org/wiki/List_of_North_American_Numbering_Plan_area_codes'],
                                                 [name:'NANPA',
                                                  link:'http://en.wikipedia.org/wiki/North_American_Numbering_Plan'],
                                                 [name: 'Caribbean nations',
                                                  link:'http://en.wikipedia.org/wiki/Caribbean_nation']
                                                ]
                                         ],
                                         [zone:  '2',
                                          id:    'Zone_2_.E2.80.93_Mostly_Africa',
                                          name:  'Mostly Africa',
                                          urls: [[name:'Africa',
                                                  link:'http://en.wikipedia.org/wiki/Africa']
                                                ]
                                         ],
                                         [zone:  '3',
                                          id:    'Zones_3.2F4_.E2.80.93_Europe',
                                          name:  'Europe',
                                          urls: [[name:'Europe',
                                                  link:'http://en.wikipedia.org/wiki/Europe']
                                                ]
                                         ],
                                         [zone:  '4',
                                          id:    null,
                                          name:  'Europe',
                                          urls: [[name:'Europe',
                                                  link:'http://en.wikipedia.org/wiki/Europe']
                                                ]
                                         ],
                                         [zone:  '5',
                                          id:    'Zone_5_.E2.80.93_Mostly_Latin_America',
                                          name:  'Mostly Latin America',
                                          urls: [[name:'Latin America',
                                                  link:'http://en.wikipedia.org/wiki/Latin_America'],
                                                 [name:'South America',
                                                  link:'http://en.wikipedia.org/wiki/South_America']
                                                ]
                                         ],
                                         [zone:  '6',
                                          id:    'Zone_6_.E2.80.93_Southeast_Asia_and_Oceania',
                                          name:  'Southeast Asia and Oceania',
                                          urls: [[name:' Southeast Asia ',
                                                  link:'http://en.wikipedia.org/wiki/Southeast_Asia'],
                                                 [name:'Oceania',
                                                  link:'http://en.wikipedia.org/wiki/Oceania']
                                                ]
                                         ],
                                         [zone:  '7',
                                          id:    'Zone_7_.E2.80.93_Seventh_World_Numbering_Zone_.28former_Soviet_Union.29',
                                          name:  'Seventh World Numbering Zone (former Soviet Union)',
                                          urls: [[name:'Soviet Union',
                                                  link:'http://en.wikipedia.org/wiki/Soviet_Union']
                                                ]
                                         ],
                                         [zone:  '8',
                                          id:    'Zone_8_.E2.80.93_East_Asia_and_Special_Services',
                                          name:  'East Asia and Special Services',
                                          urls: [[name:'East Asia ',
                                                  link:'http://en.wikipedia.org/wiki/East_Asia']
                                                ]
                                         ],
                                         [zone:  '9',
                                          id:    'Zone_9_.E2.80.93_Central.2C_South_and_Western_Asia',
                                          name:  'Central, South  and Western Asia',
                                          urls: [[name:'Central Asia',
                                                  link:'http://en.wikipedia.org/wiki/Central_Asia'],
                                                 [name:'South Asia',
                                                  link:'http://en.wikipedia.org/wiki/South_Asia'],
                                                 [name:'Western Asia',
                                                  link:'http://en.wikipedia.org/wiki/Western_Asia']
                                                ]
                                         ],
                                         [zone:  '0',
                                          id:    'Zone_0_.E2.80.93_unassigned',
                                          name:  'Unassigned',
                                          urls: []
                                         ],
                                         // Pseudo-zone 'b': locations (bases etc.) with no country code.
                                         [zone:  'b',
                                          id:    'Locations_with_no_country_code',
                                          name:  'Unassigned',
                                          urls: [[name:'Telecommunications in Antarctica',
                                                  link:'http://en.wikipedia.org/wiki/Telecommunications_in_Antarctica'],
                                                 [name:'Antarctica',
                                                  link:'http://en.wikipedia.org/wiki/Antarctica']
                                                ]
                                         ]
                                        ]
      // Zone ids appended here when their rendered XML contains the literal 'null'.
      errors                          = []
      // Different parts of the page use different names for the same country;
      // extraction passes map 'from' to 'to' via getAliasedCountryName so the
      // duplicate check in getCode matches across passes.
      countryAliases                  = [[from:'The Bahamas',
                                          to:'Bahamas'],
                                         [from:'U.S. Virgin Islands',
                                          to:'United States Virgin Islands'],
                                         [from:'St. Lucia',
                                          to:'Saint Lucia'],
                                         [from:'St. Vincent and the Grenadines',
                                          to:'Saint Vincent and the Grenadines'],
                                         [from:'Nevis',
                                          to:'Saint Kitts and Nevis'],
                                         [from:'The Gambia',
                                          to:'Gambia'],
                                         [from:"Côte d'Ivoire",
                                          to:'Ivory Coast'],
                                         [from:'Republic of the Congo',
                                          to:'Republic of Congo'],
                                         [from:'Democratic Republic of the Congo',
                                          to:'Congo, Dem. Rep. of (Zaire)'],
                                         [from:'Ascension',
                                          to:'Ascension Island'],
                                         [from:'Republic of Moldova',
                                          to:'Moldova'],
                                         [from:'Ireland',
                                          to:'Republic of Ireland'],
                                         [from:'Vatican',
                                          to:'Vatican City'],
                                         [from:'Republic of Macedonia',
                                          to:'Macedonia'],
                                         [from:'St. Pierre and Miquelon',
                                          to:'Saint Pierre and Miquelon'],
                                         [from:'Saint-Pierre and Miquelon',
                                          to:'Saint Pierre and Miquelon'],
                                         [from:'Cuba (Guantanamo Bay)',
                                          to:'Guantanamo Bay'],
                                         [from:'Cocos Islands',
                                          to:'Cocos-Keeling Islands'],
                                         [from:'Brunei Darussalam',
                                          to:'Brunei'],
                                         [from:'Federated States of Micronesia',
                                          to:'Micronesia'],
                                         [from:'Hong Kong SAR China',
                                          to:'Hong Kong'],
                                         [from:'Macau SAR China',
                                          to:'Macau'],
                                         [from:'Mainland China',
                                          to:'China'],
                                         [from:"People's Republic of China",
                                          to:'China'],
                                         [from:'Republic of China',
                                          to:'China'],
                                         [from:'Palestinian Authority',
                                          to:'Palestinian Territory'],
                                         [from:'Georgia (country)',
                                          to:'Georgia']
                                        ]
      // Output locations, resolved relative to the Desktop from crawl.properties.
      countriesCallingCodesWiki       = "${d}${props.countriesCallingCodesWiki}"
      countriesCallingCodesFlagDir    = "${d}${props.countriesCallingCodesFlagDir}"
    }
    initCodesAndBases(crawl)
    println ''
    return crawl
  }

  def initCodesAndBases(crawl) {
    // Seed every numbered zone with an empty code list, and the 'no country
    // code' pseudo-zone ('b') with an empty bases list, before extraction runs.
    ('1'..'9').each{zno ->
      getZone(crawl, zno).codes = []
    }
    getZone(crawl, 'b').bases = []
  }

  def getAliasedCountryName(crawl, name) {
    // Normalise a country name: different parts of the Wiki page use different
    // names for the same country (see crawl.countryAliases). Returns the
    // aliased name when a mapping exists, otherwise the name unchanged.
    // find{} stops at the first match instead of scanning the whole list.
    def alias = crawl.countryAliases.find{it.from == name}
    return (alias) ? alias.to : name
  }

  def extractAtAGlance(crawl) {
    // Pass 'aag': walk the first 'wikitable' (the 'At a glance' grid) and add
    // one code entry per country link found. Row index doubles as the zone
    // number (row 1 = zone '1', etc.). nc is a scratch map reused across calls
    // to getNodeAndChildInfo.
    def nc = [:]
    crawl.with {
      ataglance = page.depthFirst().TABLE.findAll{it.'@class' == 'wikitable'}[0]
      ataglance.TBODY.TR.eachWithIndex{tr, tri ->
        def z
        if (tri) { // ignore first row..
          // Row index as string selects the zone ('1'..'9').
          z      = getZone(crawl, (tri) as String)
        }
        def tds  = tr.depthFirst().findAll{it.name() == 'TD'}
        tds.each{td ->
          td.each{tdc ->
            getNodeAndChildInfo(tdc, nc)
            if (nc.nnb) { // Already processed codeCode/codeUrl..
            } else if (nc.nn == 'P') {
              // Some cells wrap their content in a P; descend one level.
              tdc.each{pc ->
                getNodeAndChildInfo(pc, nc)
                if (nc.nnb) { // Already processed codeCode/codeUrl..
                } else if (nc.nnaCode) {
                  // Anchor with a 2-letter country code: record it under the
                  // most recent B-tag's calling code.
                  z.codes << getAAGCode(crawl, nc)
                }
              } // tdc each
            } else if (nc.nnaCode) {
              z.codes << getAAGCode(crawl, nc)
            }
          } // td each
        } // tds each
      } // ataglance ... TR
    } // crawl with
    return null
  }

  // Closure that inspects one DOM node (or text fragment) and stores facts
  // about it in the shared scratch map nc. Field glossary:
  //   nodeIsString - node is a plain text fragment, not an element
  //   nn      - node name (null for text)          t   - node/fragment text
  //   ncn     - first child node                   ncnns - first child is a non-String node
  //   ncnn    - first child's name (or null)
  //   nnb     - node is a B tag (holds the calling code)
  //   tsb     - text starts with '+'
  //   nnaCode - anchor whose text is a 2-char code other than '--'
  // On a B tag it also latches codeCode/codeUrl, which later getAAGCode calls
  // consume; 'new'-class links are Wiki placeholders and yield an empty url.
  def getNodeAndChildInfo = {node, nc ->
    nc.node = node
    nc.with {
      //node         = node
      nodeIsString = node.class.simpleName == 'String'
      nn           = (nodeIsString) ? null : node.name()
      t            = (nodeIsString) ? node : node.text()
      ncn          = (nodeIsString) ? null : node.children()[0]
      ncnns        = ncn && ncn.class.simpleName != 'String'
      ncnn         = (ncnns) ? ncn.name() : null
      nnb          = nn == 'B'
      tsb          = t.startsWith('+')
      nnaCode      = nn == 'A' && t.size() == 2 && t != '--'
      if (nnb) {
        if (tsb) {
          // B tag holds the '+NN' text directly.
          codeCode = t
          codeUrl  = (!nodeIsString &&  node.'@href' && node.'@class' != 'new') ? node.'@href' : ''
        } else {
          // Otherwise the code (and optional link) lives in the B tag's first child.
          codeCode = (ncnns) ? ncn.text() : ''
          codeUrl  = (ncnns && ncn.'@href' && ncn.'@class' != 'new') ? ncn.'@href' : ''
        }
      }
     }
     return null
  }

  def getAAGCode(crawl, nc) {
    // Build one 'At a glance' code entry from the facts getNodeAndChildInfo
    // latched in nc (codeCode/codeUrl from the last B tag, the country anchor
    // in nc.node, its 2-letter code in nc.t).
    def code = [:]
    // Assigned outside the with{}: inside the closure 'code' would resolve to
    // the local variable (the map itself), not the map key.
    code.code     = stripPlusColon(nc.codeCode)
    code.with {
      codeUrl     = nc.codeUrl
      countryName = getAliasedCountryName(crawl, nc.node.'@title') // normalised name
      countryUrl  = nc.node.'@href'
      alpha2      = nc.t                                           // 2-letter country code
      flagImg     = ''                                             // filled in by the 1-9 pass
      phases      = ['aag']                                        // which passes saw this entry
    }
    return code
  }

  def stripPlusColon(code) {
    // Reduce a raw code string to the bare digits: drop everything up to and
    // including a leading '+', and everything from a ':' onwards.
    // e.g. '+44: xyz' -> '44'; with no '+' or ':' the string passes through.
    def start = code.indexOf('+') + 1             // 0 when there is no '+'
    def colon = code.indexOf(':')
    def end   = (colon == -1) ? -1 : colon - 1    // -1 means 'last char' in a Groovy range
    return code[start..end]
  }

  def extractZones1to9(crawl) {
    // Pass '1-9': locate each zone's in-depth UL lists. The parser exposes no
    // nextSibling, so for each zone we find the heading SPAN by id, take its
    // parent H3, then walk bodyContent's children collecting ULs that fall
    // between that H3 and the next H3 (the assoch3 0/1/2 state machine).
    crawl.with {
      bodyContentDiv = page.depthFirst().DIV.find{it.'@id' == 'bodyContent'}
      zones.each{z ->
        if (z.id) {
          def zs = page.depthFirst().SPAN.find{it.'@id' == z.id}
          def sect  = [:]
          sect.span = zs
          sect.h3   = zs.parent()
          sect.uls  = []
          // 0 = before this zone's H3, 1 = inside its section, 2 = past it.
          def assoch3 = 0
          bodyContentDiv.children().each{child ->
            if (child == sect.h3) {
              assoch3 = 1
            } else if (child.name() == 'H3' && assoch3 == 1) {
              assoch3 = 2
            } else if (assoch3 == 1 && child.name() == 'UL') {
              sect.uls << child
            }
          }
          z.sect = sect
        }
      }
      zones.each{z ->
        switch (z.zone) {
          // Zone 4 has no heading of its own: it is the second UL under the
          // shared 'Zones 3/4' heading, i.e. zones[2] (the zone '3' entry).
          case '4'                                 : z.ul = zones[2].sect.uls[1]
                                                     break
          case {it in '1'..'3' || it in '5'..'9' } : z.ul = z.sect.uls[0]
                                                     break
        }
      }
      // Drop the scratch section data once each zone's UL is pinned.
      zones.each{z -> z.sect = null}
    }
    ('1'..'9').each{
      processZone(crawl, it)
    }
    return null
  }

  def getZone(crawl, zone) {
    // Look up a zone entry ('1'..'9', '0' or 'b') in crawl.zones.
    // Returns null when no entry matches. find{} replaces the original
    // full each{} scan — zone ids are unique, so first match is the match.
    return crawl.zones.find{it.zone == zone}
  }

  def processZone(crawl, zone) {
    // Pass '1-9' per zone: scan every LI under the zone's UL, and only take
    // lines that carry all three of: a dialing-code link (a1), a flag image
    // inside a SPAN (a2/i), and a country link (a3). Entries are merged into
    // any existing code (from the 'aag' pass) via getCode.
    crawl.with {
      def z    = getZone(crawl, zone)
      def zUl  =  z.ul
      def ul   = zUl.depthFirst().findAll{it.name() == 'UL'}
      ul.each{u ->
        u.each{l ->
          def a1 = l?.A[0]
          def a2 = l?.SPAN[0]?.A
          def a3 = l.A[1]
          def i  = a2?.IMG
          if (a1 && a2 && i && a1.'@href' != a2.'@href'[0] && a3) { // Currently only process if li contains code link, flag and country link
            // Split compound code text like '+A, +B' / '+A and +B' into parts.
            def t =  a1.text()
            def c1 = t.tokenize(',')
            def c5 = []
            c1.each{c2 ->
              // NOTE(review): tokenize('and') splits on the CHARACTERS a/n/d,
              // not the word 'and' — harmless here only because code strings
              // contain just '+', digits and spaces.
              // NOTE(review): c3 has no 'def'; inside crawl.with{} the
              // assignment lands on the crawl map as entry 'c3' rather than a
              // local variable.
              c3 = c2.tokenize('and')
              c3.each{c4 ->
                c5 << c4.trim()
              }
            }
            c5.each{cd ->
              def cdStrip     =  stripPlusColon(cd)
              def countryName =  getAliasedCountryName(crawl, a3.'@title')
              def code         = getCode(z, cdStrip, countryName)
              def newc = !(code.code)             // empty map => not seen before
              if (newc) {
                code.code        = cdStrip
                code.codeUrl     = (a1.'@class' == 'new') ? '' : a1.'@href'  // 'new' = placeholder link
                code.countryName = countryName
                code.countryUrl  = a3.'@href'
                code.alpha2      = ''
              }
              // Flag image is (re)set whether or not the entry is new.
              code.flagImg       = i.'@src'[0]
              if (newc) {
                code.phases      = []
              }
              code.phases << '1-9'
            }
          }
        }
      }
    }

    return null
  }

  def getCode(zone, code, countryName) {
    // Fetch the entry matching (code, countryName) from this zone, creating
    // and registering an empty one when the pair has not been seen before —
    // this is the duplicate guard across the extraction passes.
    def match = zone.codes.find{it.code == code && it.countryName == countryName}
    if (match) {
      return match
    }
    def fresh = [:]
    zone.codes << fresh
    return fresh
  }

  def extractBases(crawl) {
    // Extract the 'locations with no country code' table into pseudo-zone 'b'.
    // Column layout assumed: TD0 = base name (optionally linked), TD1 = code
    // text, TD2 = up to two flag+country SPANs matching the codes in order.
    crawl.with {
      def z = getZone(crawl, 'b')
      bases = page.depthFirst().TABLE.find{it.'@id' == 'sortable_table_id_0'}
      bases.TBODY.TR.eachWithIndex{tr, tri ->
        if (tri) { // Ignore first row column headings..
          def td0 = tr.TD[0]
          def td0A0 = td0.A[0]
          def baseName
          def baseUrl
          if (td0A0) {
            baseName = td0A0.text()
            // 'new'-class anchors are Wiki edit placeholders; keep no url.
            baseUrl  = (td0A0.'@class' == 'new') ? '' : td0A0.'@href'
          } else {
            baseName = td0.text()
            baseUrl  = ''
          }
          def td1 = tr.TD[1]
          def baseCode = td1.text()
          // A cell can carry several codes separated by '/'.
          def bc = extractBaseCodesFromString(baseCode)
          def td2 = tr.TD[2]
          def fc  = []
          def td2s0A0 = td2.SPAN[0].A[0]
          def td2s1A0 = (td2.SPAN.size() == 2) ? td2.SPAN[1].A[0] : null
          def spAnchs = [td2s0A0, td2s1A0]
          // Pair each country anchor with the code at the same position.
          spAnchs.eachWithIndex{a, i ->
            if (a) {
              def c = [:]
              c.name    = getAliasedCountryName(crawl, a.'@title')
              c.url     = a.'@href'
              c.flagImg = a.IMG[0].'@src'
              c.code    = bc[i]
              fc << c

            }
          }
          def b = [:]
          b.baseName  = baseName
          b.baseUrl   = baseUrl
          b.countries = fc
          z.bases << b
        }
      }
    }
    return null
  }

  def extractBaseCodesFromString(baseCode) {
    // Parse a base-code cell such as '+871 (Inmarsat) / +872' into bare codes.
    // For each '/'-separated part, keep the text after the '+' and before any
    // '(' annotation, trimmed.
    return baseCode.tokenize('/').collect{part ->
      def from  = part.indexOf('+') + 1            // 0 when no '+' present
      def paren = part.indexOf('(')
      def upto  = (paren == -1) ? -1 : paren - 1   // -1 = end of string
      part[from..upto].trim()
    }
  }

  def extractCompleteListing(crawl) {
    // Pass 'comp': walk the complete listing table. TD0 is the country name
    // (with any ' (from ...)' suffix stripped), TD1 holds one anchor per
    // '+NNN' code. A code's first digit selects the zone it is merged into.
    crawl.with {
      completeListing = page.depthFirst().TABLE.find{it.'@id' == 'sortable_table_id_1'}
      completeListing.TBODY.TR.eachWithIndex{tr, tri ->
        if (tri) { // Ignore first row column headings..
          def td0 = tr.TD[0]
          def countryName = td0.text()
          def ifr = countryName.indexOf(' (from')
          if (ifr != -1) countryName = countryName[0..ifr -1]
          countryName = getAliasedCountryName(crawl, countryName)
          def td1 = tr.TD[1]
          // Skip footnote anchors (#cite...); keep only real code links.
          def anchs = td1.breadthFirst().findAll{it.name() == 'A' && !(it.'@href'.startsWith('#cite'))}
          def codes = []
          anchs.each{a ->
            def c = [:]
            c.code =  a.text()[1..-1]   // drop the leading '+'
            c.codeUrl  = (a.'@class' == 'new') ? '' : a.'@href'
            codes << c
          }
          codes.each {cd ->
            // First digit of the code is its world numbering zone.
            def z    = getZone(crawl, cd.code[0])
            def code = getCode(z, cd.code, countryName)
            def newc = !(code.code)
            if (newc) {
              code.code        = cd.code
              code.codeUrl     = cd.codeUrl
              code.countryName = countryName
              code.countryUrl  = ''
              code.alpha2      = ''
              code.flagImg     = ''
              code.phases      = []
            } else if (!code.codeUrl) {
              // Backfill a code url the earlier passes did not have.
              code.codeUrl = cd.codeUrl
            }
            code.phases << 'comp'
          }
        }
      }
    }
    return null
  }

  def getNekoHtml(crawl) {
    // Parse the current Selenium page source into a Node tree via NekoHTML.
    // Namespaces are disabled so tags are addressed by bare name (TABLE, TD, ...).
    def saxParser = new org.cyberneko.html.parsers.SAXParser()
    saxParser.setFeature('http://xml.org/sax/features/namespaces', false)
    return new XmlParser(saxParser).parseText(selenium.getHtmlSource())
  }

  def printResults(crawl) {
    // Print every code that was NOT seen by all three passes (aag, 1-9, comp)
    // — the likely data problems — then dump the bases. Fixes a bug where an
    // early 'return null' made the bases section unreachable.
    def z
    ('1'..'9').each{
      z  = getZone(crawl, it)
      // Sort by code then country so related anomalies print together.
      z.codes.sort{a, b -> if (a.code == b.code) a.countryName <=> b.countryName else a.code <=> b.code}
      z.codes.each{if (it.phases.size() != 3) println it}
      if (z.codes) println '---'
    }
    z  = getZone(crawl, 'b')
    if (z.bases) {
      z.bases.each{println it}
      println '---'
    }
    return null
  }

  def dumpResultsToDisk(crawl) {
    // Write the whole result set as XML, appending zone by zone. Any zone
    // whose rendered XML contains the literal 'null' is recorded in errors.
    // Fixes two malformed-XML bugs: the calling-codes list was closed with
    // '<calling-codes>' instead of '</calling-codes>', and <country-alpha2>
    // was closed with the mismatched '</country-alpha>'.
    crawl.with {
      println ''
      println '==='
      println "Writing calling codes to : $countriesCallingCodesWiki"
      def xmlFile = new File(countriesCallingCodesWiki)
      xmlFile.write("<zones>$nl", encoding)
      def z
      def xml
      ('1'..'9').each{zno ->
        z    = getZone(crawl, zno)
        xml  = writeZoneHeaderXml(crawl, z)
        xml += "    <calling-codes>$nl"
        z.codes.each{c ->
          xml += "      <calling-code>$nl"
          xml += "        <code>${c.code}</code>$nl"
          xml += "        <code-url>${c.codeUrl}</code-url>$nl"
          xml += "        <country-name>${c.countryName}</country-name>$nl"
          xml += "        <country-url>${c.countryUrl}</country-url>$nl"
          xml += "        <country-alpha2>${c.alpha2}</country-alpha2>$nl"
          xml += "        <country-flag-img>${c.flagImg}</country-flag-img>$nl"
          xml += "        <phases count='${c.phases.size()}'>$nl"
          c.phases.each{p ->
            xml += "          <phase>$p</phase>$nl"
          }
          xml += "        </phases>$nl"
          xml += "      </calling-code>$nl"
        }
        xml += "    </calling-codes>$nl"
        xml += "  </zone>$nl"
        // Crude data-quality check: 'null' anywhere in the XML flags the zone.
        if (xml.indexOf('null') != -1) errors << zno
        xmlFile.append(xml, encoding)
      }
      // Pseudo-zone 'b': bases / locations with no country code.
      def bs = 'b'
      z    = getZone(crawl, bs)
      xml  = writeZoneHeaderXml(crawl, z)
      xml += "    <bases>$nl"
      z.bases.each{b ->
        xml += "      <base>$nl"
        xml += "        <name>${b.baseName}</name>$nl"
        xml += "        <url>${b.baseUrl}</url>$nl"
        xml += "        <countries>$nl"
        b.countries.each{c ->
          xml += "          <country>$nl"
          xml += "            <calling-code>${c.code}</calling-code>$nl"
          xml += "            <flag-img>${c.flagImg}</flag-img>$nl"
          xml += "            <name>${c.name}</name>$nl"
          xml += "            <url>${c.url}</url>$nl"
          xml += "          </country>$nl"
        }
        xml += "        </countries>$nl"
        xml += "      </base>$nl"
      }
      xml += "    </bases>$nl"
      xml += "  </zone>$nl"
      xml += "</zones>$nl"
      if (xml.indexOf('null') != -1) errors << bs
      xmlFile.append(xml, encoding)
    } // crawl
    return null
  }

  def writeZoneHeaderXml(crawl, z) {
    // Render the opening <zone> fragment (prefix, name, id, reference urls).
    // The caller appends the matching </zone>. Fixes a bug where the <urls>
    // element was closed with the mismatched tag '</url>'.
    def xml
    crawl.with {
      xml  = "  <zone>$nl"
      xml += "    <prefix>${z.zone}</prefix>$nl"
      xml += "    <name>${z.name}</name>$nl"
      xml += "    <id>${(z.id)?:''}</id>$nl"
      xml += "    <urls>$nl"
      z.urls.each{zu ->
        xml += "      <url>$nl"
        xml += "        <name>${zu.name}</name>$nl"
        xml += "        <link>${zu.link}</link>$nl"
        xml += "      </url>$nl"
      }
      xml += "    </urls>$nl"
    }
    return xml
  }

  def reportErrors(crawl) {
    // Summarise data quality: crawl.errors holds the ids of zones whose
    // rendered XML contained the literal 'null' (see dumpResultsToDisk).
    println ''
    println '==='
    if (crawl.errors) {
      println 'Null data in following Zones:'
      crawl.errors.each{println it}
    } else {
      println 'No null data in XML'
    }
    return null
  }

  def writeImagesToDisk(crawl) {
    // Save the flag image of every country found in zones 1-9 and in the
    // 'no country code' bases table. Fixes a subtle leak: 'z' was assigned
    // undeclared inside crawl.with{}, so the map delegate silently stored a
    // stray 'z' entry on the crawl map; locals are now properly declared.
    ('1'..'9').each{zno ->
      def z = getZone(crawl, zno)
      z.codes.each{c ->
        processImageForCountry(c, crawl)
      }
    }
    def zb = getZone(crawl, 'b')
    zb.bases.each{b ->
      b.countries.each{c ->
        processImageForCountry(c, crawl)
      }
    }
    return null
  }

  // Closure (kept as a closure deliberately — see the accompanying post):
  // downloads one country's flag to the configured flag directory, skipping
  // entries that never picked up a flag image.
  def processImageForCountry = {c, crawl ->
    if (c.flagImg) {
      def imgUrl      = c.flagImg.toURL()
      // URL-encode the name so it is safe as a file name (spaces become '+').
      def encodedName = URLEncoder.encode("${c.countryName}.png")
      def target
      crawl.with {
        target = "$countriesCallingCodesFlagDir$fs$encodedName"
      }
      writeImageToDisk(imgUrl, new File(target as String))
    }
  }

  def writeImageToDisk(URL u, File f) {
    // Download the image at u and re-encode it as PNG into f. Best-effort:
    // failures are logged and the crawl continues.
    try {
      def img = ImageIO.read(u)
      if (img == null) {
        // ImageIO.read returns null for unrecognised content; without this
        // guard ImageIO.write(null, ...) throws IllegalArgumentException,
        // which the IOException catch below would NOT swallow.
        println "Unreadable image | $u"
      } else {
        ImageIO.write(img, "png", f)
      }
    } catch (IOException e) {
      println "$e | $u"
    }
    return null
  }
}

Related posts:

Advertisements

About this entry