Thursday, July 14, 2011

After a bug fix in MapReduce (M/R) processing in mongo 1.8.1, I am posting a follow-up on the logdb collection analysis.

Software versions:
mongod 1.8.2
node 0.4.5
node-mongodb-native

A node.js script node_mongodb_log_app.js is shown below.

GLOBAL.DEBUG = true;

var sys = require('sys');

var Db = require('mongodb').Db,
    Connection = require('mongodb').Connection,
    Server = require('mongodb').Server,
    BSON = require('mongodb').BSONNative;

// Handle for the local "logdb" database; the connection itself is only
// established when db.open() is called.
var db = new Db('logdb', new Server('127.0.0.1', 27017, {}), {native_parser: true});

// Optional command-line arguments: <longitude> <latitude>.
// process.argv is used rather than the legacy process.ARGV alias, which
// newer node releases no longer provide.
var filter = null;
var lon = (process.argv[2]) ? Number(process.argv[2]) : null;
var lat = (process.argv[3]) ? Number(process.argv[3]) : null;

// Number() yields NaN for non-numeric text; stop here instead of sending a
// meaningless $near query to the server.
if((lon != null && isNaN(lon)) || (lat != null && isNaN(lat))){
    sys.puts('Usage: node node_mongodb_log_app.js [longitude latitude]');
    process.exit(1);
}

// With both coordinates present, restrict the map/reduce input to the log
// entries nearest to that point; otherwise filter stays null (all entries).
if(lat != null && lon != null){
    filter = { query: {  loc : { $near: [lon, lat]}}};
    sys.puts(sys.inspect(filter));
}

// Open the database, sum the "size" field of the (optionally filtered) log
// entries with map/reduce, print the resulting total(s) and close the
// connection once the result cursor is exhausted.
db.open(function(error, db) {
    if(error){
        sys.puts(error);
        return;
    }

    // Map step: every entry emits its size under the same key (1), so the
    // reduce step produces one grand total.
    var map_request_size = function(){
        emit(1, this.size);
    };

    // Reduce step: add up all sizes emitted for a key.
    var reduce = function (k, vals) {
        var sum = 0;
        vals.forEach(function (v) {sum += v;});
        return sum;
    };

    // Report an error (if any) and close the connection exactly once.
    var closed = false;
    var finish = function(error){
        if(error){
            sys.puts(error);
        }
        if(!closed){
            closed = true;
            db.close();
        }
    };

    // Print the value of every document in the map/reduce result collection.
    // The connection is closed only when the cursor signals its end with a
    // null document, not after each document.
    var printResults = function(error, results){
        if(error){
            return finish(error);
        }
        results.find(function(error, cursor){
            if(error){
                return finish(error);
            }
            cursor.each(function(error, mrdoc){
                if(error){
                    return finish(error);
                }
                if(mrdoc != null){
                    sys.puts(mrdoc.value);
                    return;
                }
                finish(null);
            });
        });
    };

    // query all results, total bandwidth
    db.collection('logentry', function(error, collection){
        if(error){
            return finish(error);
        }
        // An empty options object is used when no geographic filter was
        // given, so a single mapReduce call covers both cases.
        var options = (filter != null) ? filter : {};
        collection.mapReduce(map_request_size, reduce, options, printResults);
    });

});

We want to accumulate all file downloads into a single total number of bytes. To do that, we run the script without parameters:

$ node node_mongodb_log_app.js
204612741925

If we want to analyze consumption near a geographic location, we issue a call like this:

$ node node_mongodb_log_app.js -108 32.6
{ query: { loc: { '$near': [Object] } } }
30435795

Tuesday, May 17, 2011

Geotagging records in mongoDB with node.js

We use the popular non-relational MongoDB database to store web traffic in custom logs.

The log entries (logentry documents) were originally inserted from a Pylons framework using
the mongokit package. Each GET or POST call to a controller is recorded following this document model:

class LogEntry(Document):
    """Log record for one request (GET or POST) handled by a controller.

    Documents are stored in the 'logentry' collection.
    """
    collection_name = 'logentry'
    structure = {
        'host_url': unicode,
        'controller' : unicode,
        'url' : unicode,
        'query_string' : unicode,
        'request_start': datetime.datetime,
        'request_end' : datetime.datetime,
        'size' : float,
        'is_xhr' : bool,
        'method': unicode,
        'user_agent': unicode,
        'referrer' : unicode,
        'session' : unicode,
        'post_vars': dict,
        'ip' : unicode
    }
    default_values = {
        # Pass the callable itself: calling now() here would run only once,
        # when the class is defined, and every document would then share
        # that same stale timestamp.
        'request_start' : datetime.datetime.now,
        'request_end' : datetime.datetime.now,
        # 'size' is declared as float above, so the default is a float too.
        'size' : 0.0
    }

The database in mongo is called "logdb" and the collection "logentry".

The code shown below illustrates the use of the node.js http client to asynchronously fetch a JSON response from the freegeoip.net REST API service, containing the geographic location of the ip value stored in the logentry document. Using the mongodb node.js driver, the record is then updated with a point containing longitude and latitude information.

Software versions:
mongod 1.8.1
node 0.4.5
node-mongodb-native

GLOBAL.DEBUG = true;

var sys = require('sys');
var http = require('http');

var Db = require('mongodb').Db;
var Server = require('mongodb').Server;
var BSON = require('mongodb').BSONNative;

var db = new Db('logdb', new Server('127.0.0.1', 27017, {}), {native_parser: true});

// Make sure spatial index exists http://www.mongodb.org/display/DOCS/Geospatial+Indexing
// db.logentry.ensureIndex({loc: "2d"});

// Geotag up to 10 log entries per run: look up each entry's ip at
// freegeoip.net and store the returned longitude/latitude in its "loc" field.
db.open(function(error, db) {
    if(error){
        sys.puts(error);
        return;
    }

    // The lookups run concurrently, so the connection may only be closed
    // after the cursor is exhausted AND every started lookup has finished.
    var pending = 0;
    var cursorDone = false;
    var closed = false;

    var closeWhenIdle = function(){
        if(cursorDone && pending === 0 && !closed){
            closed = true;
            db.close();
        }
    };

    // Stop reading the cursor (reporting the error, if any).
    var endOfCursor = function(error){
        if(error){
            sys.puts(error);
        }
        cursorDone = true;
        closeWhenIdle();
    };

    // get the logentry collection
    db.collection('logentry', function(error, collection){
        if(error){
            return endOfCursor(error);
        }
        // Find all ips without geo location. (Only 10 at a time)
        // The selector tests "loc", the field written by the update below,
        // so records that are already tagged are not fetched again.
        collection.find({loc: {$exists: false}}, {'limit': 10}, function(error, cursor){
            if(error){
                return endOfCursor(error);
            }
            cursor.each(function(error, logdoc){
                if(error || logdoc == null){
                    return endOfCursor(error);
                }

                pending += 1;
                // Both the request 'error' event and the normal completion
                // path end up here; count each lookup only once.
                var finished = false;
                var done = function(error){
                    if(finished){
                        return;
                    }
                    finished = true;
                    if(error){
                        sys.puts(error);
                    }
                    pending -= 1;
                    closeWhenIdle();
                };

                // https://github.com/fiorix/freegeoip
                var options = {
                    port: 80,
                    host: 'freegeoip.net',
                    path: '/json/' + logdoc.ip
                };

                var request = http.request(options, function(response) {
                    response.setEncoding('utf8');
                    var jsonResponse = '';
                    response.on('data', function (chunk) {
                        jsonResponse += chunk;
                    });
                    response.on('end', function(){
                        var geoip;
                        try {
                            geoip = JSON.parse(jsonResponse);
                        } catch (parseError) {
                            // The service answered with something that is not JSON.
                            return done(parseError);
                        }
                        sys.puts("POINT("+ geoip.longitude + " " + geoip.latitude + ")");
                        console.log(geoip.ip);

                        var lon = Number(geoip.longitude);
                        var lat = Number(geoip.latitude);
                        if(isNaN(lon) || isNaN(lat)){
                            // Never store NaN coordinates in the indexed field.
                            return done('No coordinates returned for ' + logdoc.ip);
                        }

                        var loc = {
                            'loc': {
                                lon: lon,
                                lat: lat
                            }
                        };
                        var updateCommand = { "$set": loc };
                        console.log(sys.inspect(updateCommand));
                        // multi: tag every entry logged for this ip, not just
                        // one arbitrary match. safe: have the callback report
                        // the outcome of the write before we consider closing.
                        collection.update(
                            {'ip': logdoc.ip}, updateCommand,
                            {multi: true, safe: true}, done);
                    });
                });
                // Without a listener a network failure would be thrown as an
                // uncaught 'error' event and abort the whole run.
                request.on('error', done);
                request.end();
            });
        });
    });
});

From the mongo console we can check that the logentry records now contain locatable coordinates. I will post a MapReduce analysis procedure later on.

MongoDB shell version: 1.8.1
connecting to: test
> use logdb                               
switched to db logdb
> db.logentry.find({loc: {$exists: true}})
{ "_id" : "LogEntry-000050c5-487a-421e-bf91-b062db988c7d", 
  "controller" : "root", 
  "host_url" : "http://12x.x4.x3.10x:8884", 
  "ip" : "17x.2x.x6.x16", 
  "is_xhr" : true, 
  "loc" : { "lon" : -105.96, "lat" : 35.678 }, 
  "method" : "POST", 
  "post_vars" : { "node" : "Digital Orthophotography_|_2003 Color Infrared (CIR)_|_New Mexico (1m)", "end_date" : "", "filter" : "", "limit" : "25", "offset" : "0", "start_date" : "" }, 
  "query_string" : "", 
  "referrer" : "http://rgis.unm.edu/browsedata", 
  "request_end" : ISODate("2010-01-26T13:59:14.019Z"), 
  "request_start" : ISODate("2010-01-26T13:59:13.938Z"), 
  "session" : "237211756113636361222701966583755313674", 
  "size" : 6495, 
  "url" : "browsedata/json/tree/themes", 
  "user_agent" : "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7 (.NET CLR 3.5.30729)" 
}

...

Saturday, May 14, 2011

Converting an Ontology into a Category

Category Theory is the mathematical theory of structure. We are interested in studying the practical aspects of the categorical completion of ontologies, a process that transforms ontologies into categories. This process is not generic (for reasons not explained here): apart from the main axioms a category must satisfy, we require the presence of other elements, such as all colimits and a terminal object.

The first thing to do is to represent the identity morphism that every object in a category must have. This can be done simply by adding a reflexive and transitive object property hasIdentityMorphism with range Thing and domain Thing, so that all things have such an identity property.

A second task to accomplish is to endow an ontology O with a terminal object class to represent the terminal object in the categorical completion C of O. I took the Pizza ontology and added a subclass TerminalObject to Thing. Also, in order to ensure "all things" have a terminal object, a transitive object property (terminalProperty) was added.

I use the Java OWLAPI to materialize these operations. These are the relevant lines of code for the first two steps explained above:

// Load the Pizza ontology; the data factory obtained from the manager is
// used to create every class, property and axiom below.
OWLOntologyManager manager = OWLManager.createOWLOntologyManager();
OWLDataFactory factory = manager.getOWLDataFactory();
IRI iri = IRI.create("http://www.co-ode.org/ontologies/pizza/pizza.owl");
OWLOntology pizzaOntology = manager.loadOntologyFromOntologyDocument(iri);

OWLClass Thing = factory.getOWLClass(IRI.create("http://www.w3.org/2002/07/owl#Thing"));

// Class and object property that stand for the terminal object.
OWLClass TerminalObject = factory.getOWLClass(IRI.create(iri + "#TerminalObject"));
OWLObjectProperty terminalProperty = factory.getOWLObjectProperty(IRI.create(iri + "#hasTerminalObject"));

// Declare hasTerminalObject as a transitive property.
OWLTransitiveObjectPropertyAxiom TerminalObjectProperty = factory.getOWLTransitiveObjectPropertyAxiom(terminalProperty);
manager.addAxiom(pizzaOntology, TerminalObjectProperty);
Set<OWLAxiom> terminalAxioms = new HashSet<OWLAxiom>();

// hasTerminalObject goes from Thing to TerminalObject, and TerminalObject
// is itself a subclass of Thing.
terminalAxioms.add(factory.getOWLObjectPropertyDomainAxiom(terminalProperty, Thing));
terminalAxioms.add(factory.getOWLObjectPropertyRangeAxiom(terminalProperty, TerminalObject));
terminalAxioms.add(factory.getOWLSubClassOfAxiom(TerminalObject, Thing));

manager.addAxioms(pizzaOntology, terminalAxioms);

// Save the transformed ontology in OWL/XML format.
File file = new File("/tmp/catcomp_pizza.owl");
OWLXMLOntologyFormat owlxmlFormat = new OWLXMLOntologyFormat();
manager.saveOntology(pizzaOntology, owlxmlFormat, IRI.create(file.toURI()));

This is a screenshot taken from a Protege/OntoGraf view of the newly transformed ontology.