Tuesday, May 17, 2011

Geotagging records in MongoDB with node.js

We use the popular non-relational MongoDB database to store web traffic in custom logs.

The log entries (logentry documents) were originally inserted from a Pylons application using
the mongokit package. Each GET or POST call to a controller is recorded using the following document model:

class LogEntry(Document):
    """One web request (a GET or POST call to a controller) stored in the
    'logentry' collection.

    `structure` declares the type of every field that mongokit validates;
    `default_values` supplies the values used when a field is not set.
    """
    collection_name = 'logentry'
    structure = {
        'host_url': unicode,
        'controller': unicode,
        'url': unicode,
        'query_string': unicode,
        'request_start': datetime.datetime,
        'request_end': datetime.datetime,
        'size': float,
        'is_xhr': bool,
        'method': unicode,
        'user_agent': unicode,
        'referrer': unicode,
        'session': unicode,
        'post_vars': dict,
        'ip': unicode
    }
    default_values = {
        # Pass the callable itself, not its result: calling now() here would
        # run once when the class is defined and stamp every document with
        # that same import-time value.
        'request_start': datetime.datetime.now,
        'request_end': datetime.datetime.now,
        # 'size' is declared as float above, so the default is a float too.
        'size': 0.0
    }

The database in mongo is called "logdb" and the collection "logentry".

The code below uses the node.js HTTP client to asynchronously fetch a JSON response from the freegeoip.net REST API service, containing the geographic location of the ip value stored in each logentry document. Using the MongoDB driver for node.js, the record is then updated with a point containing the longitude and latitude.

Software versions:
mongod 1.8.1
node 0.4.5
node-mongodb-native

// `global` is the standard name of the global object; `GLOBAL` was only an alias.
global.DEBUG = true;

// `util` replaces the deprecated `sys` module.
var util = require('util');
var http = require('http');

var Db = require('mongodb').Db;
var Server = require('mongodb').Server;
var BSON = require('mongodb').BSONNative;

var db = new Db('logdb', new Server('127.0.0.1', 27017, {}), {native_parser: true});

// Make sure spatial index exists http://www.mongodb.org/display/DOCS/Geospatial+Indexing
// db.logentry.ensureIndex({loc: "2d"});

/**
 * Look up the geographic location of an ip through the freegeoip.net JSON API.
 * https://github.com/fiorix/freegeoip
 *
 * @param {string} ip - address to look up.
 * @param {function(Error, Object=)} callback - called exactly once, with an
 *     error or with the parsed JSON response.
 */
function fetchGeoIp(ip, callback) {
    var finished = false;
    // Guarantees a single callback even if both 'error' and 'end' fire.
    function done(error, geoip) {
        if (finished) {
            return;
        }
        finished = true;
        callback(error, geoip);
    }

    var options = {
        port: 80,
        host: 'freegeoip.net',
        // The ip comes from request logs, so escape it before building the path.
        path: '/json/' + encodeURIComponent(ip)
    };

    var request = http.request(options, function(response) {
        response.setEncoding('utf8');
        var jsonResponse = '';
        response.on('data', function(chunk) {
            jsonResponse += chunk;
        });
        response.on('end', function() {
            if (response.statusCode !== 200) {
                done(new Error('freegeoip.net returned HTTP ' + response.statusCode + ' for ' + ip));
                return;
            }
            var geoip;
            try {
                geoip = JSON.parse(jsonResponse);
            } catch (parseError) {
                // An unparsable body must not take the whole process down.
                done(parseError);
                return;
            }
            done(null, geoip);
        });
    });
    request.on('error', done);
    request.end();
}

db.open(function(error, db) {
    if (error) {
        console.error(error);
        return;
    }
    // get the logentry collection
    db.collection('logentry', function(error, collection) {
        if (error) {
            console.error(error);
            db.close();
            return;
        }

        var pending = 0;          // lookups/updates still in flight
        var cursorDone = false;   // true once the cursor has been drained
        var closed = false;

        // Close the connection only after the cursor is exhausted AND every
        // outstanding lookup/update has completed.
        function closeWhenIdle() {
            if (cursorDone && pending === 0 && !closed) {
                closed = true;
                db.close();
            }
        }

        function finishOne() {
            pending -= 1;
            closeWhenIdle();
        }

        // Find all entries without a location. (Only 10 at a time)
        // The filter uses `loc`, the field written below, so that entries
        // already tagged are not selected again on the next run.
        collection.find({loc: {$exists: false}}, {'limit': 10}, function(error, cursor) {
            if (error) {
                console.error(error);
                cursorDone = true;
                closeWhenIdle();
                return;
            }
            cursor.each(function(error, logdoc) {
                if (error) {
                    console.error(error);
                    cursorDone = true;
                    closeWhenIdle();
                    return;
                }
                if (logdoc == null) {
                    // A null document signals the end of the cursor.
                    cursorDone = true;
                    closeWhenIdle();
                    return;
                }
                if (!logdoc.ip) {
                    // Nothing to look up for this entry.
                    return;
                }

                pending += 1;
                fetchGeoIp(logdoc.ip, function(error, geoip) {
                    if (error) {
                        console.error(error);
                        finishOne();
                        return;
                    }

                    var lon = Number(geoip.longitude);
                    var lat = Number(geoip.latitude);
                    if (isNaN(lon) || isNaN(lat)) {
                        // Do not store NaN coordinates.
                        console.error('No usable coordinates for ' + logdoc.ip);
                        finishOne();
                        return;
                    }

                    console.log("POINT(" + geoip.longitude + " " + geoip.latitude + ")");
                    console.log(geoip.ip);
                    var updateCommand = { "$set": { 'loc': { lon: lon, lat: lat } } };
                    console.log(util.inspect(updateCommand));

                    // Select by _id so exactly the document that was read is
                    // updated, even when several entries share the same ip.
                    collection.update({'_id': logdoc._id}, updateCommand, function(error) {
                        if (error) {
                            console.error(error);
                        }
                        finishOne();
                    });
                });
            });
        });
    });
});

From the mongo console we can verify that the logentry records now contain locatable coordinates. I will post a MapReduce analysis procedure later on.

MongoDB shell version: 1.8.1
connecting to: test
> use logdb                               
switched to db logdb
> db.logentry.find({loc: {$exists: true}})
{ "_id" : "LogEntry-000050c5-487a-421e-bf91-b062db988c7d", 
  "controller" : "root", 
  "host_url" : "http://12x.x4.x3.10x:8884", 
  "ip" : "17x.2x.x6.x16", 
  "is_xhr" : true, 
  "loc" : { "lon" : -105.96, "lat" : 35.678 }, 
  "method" : "POST", 
  "post_vars" : { "node" : "Digital Orthophotography_|_2003 Color Infrared (CIR)_|_New Mexico (1m)", "end_date" : "", "filter" : "", "limit" : "25", "offset" : "0", "start_date" : "" }, 
  "query_string" : "", 
  "referrer" : "http://rgis.unm.edu/browsedata", 
  "request_end" : ISODate("2010-01-26T13:59:14.019Z"), 
  "request_start" : ISODate("2010-01-26T13:59:13.938Z"), 
  "session" : "237211756113636361222701966583755313674", 
  "size" : 6495, 
  "url" : "browsedata/json/tree/themes", 
  "user_agent" : "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7 (.NET CLR 3.5.30729)" 
}

...

No comments:

Post a Comment