Getting useful index information from MongoDB

Here is a MongoDB script for presenting index information in a more concise way than getIndexes() provides. This script also presents an index’s total size along with a breakdown of its size on all of the shards.

//mongo --eval="var collection='file';"

var ret = db[collection].getIndexes().map(function(i){
    return {"key":i.key, "name":i.name};
});

var o = {};
for(r in ret) {
    o[ret[r].name] = ret[r].key;
}

var cstats = db[collection].stats();
for(k in cstats.indexSizes) {
    o[k].totalsize = cstats.indexSizes[k];
}

var shardinfo = cstats.shards;
for(s in shardinfo) {
    for(k in shardinfo[s].indexSizes) {
        if(!o[k].shards) o[k].shards = {};
        o[k].shards[s] = shardinfo[s].indexSizes[k];
    }
}

printjson(o);

Produces the following output:

[]

Simple Scala Map/Reduce Job

I was recently tasked with writing a Hadoop map/reduce job. This job had the requirement of taking a list of regular expressions and scouring hundreds of gigs worth of log files for matches. Since I’ve been leaning more and more towards Scala I wanted to use it for my job but I also wanted to use Maven for my job’s package management to make the job easy to setup and extend. And finally, I wanted to have unit tests for my mapper and reducer and an overall job unit test. The result is this project I posted to GitHub as a template for future projects. I hope it proves as helpful for others as I’m sure it’ll be for me.

[]

Select distinct for MongoDB

Here is a handy script I’ve been using for MongoDB to retrieve a list of all the fields used in a collection. This uses a map/reduce routine and has to comb over all the documents in a collection so you may want to exercise caution when using this script.

// usage:
// mongo localhost/foo --quiet --eval="var collection='bar';" getcollectionkeys.js
var mr = db.runCommand({
  "mapreduce":collection,
  "map":function() {
    for (var key in this) { emit(key, null); }
  },
  "reduce":function(key, stuff) { return null; },
  "out":collection + "_keys"
})

print(db[mr.result].distinct("_id"))

db[collection+"_keys"].drop()
[]

Simple PhantomJS web scraping script

Here is a simple web scraping script I wrote for PhantomJS, the immensely useful headless browser, to load a page, inject jQuery into it, and then scrape the page using a user-supplied jQuery selector.

page = require('webpage').create()
system = require 'system'

phantom.injectJs "static/js/underscore-min.js"

page.onConsoleMessage = (msg) ->
    if not msg.match /^Unsafe/
        console.log msg

scrapeEl = (elselector) ->
    rows = $ elselector
    for el in rows
        if el.innerHTML
            str = el.innerHTML.trim()
            if str.length > 0
                console.log str

page.open system.args[1], (status) ->
    if status isnt 'success'
        phantom.exit 1
    else
        page.injectJs "static/js/underscore-min.js"
        page.injectJs "static/js/utils.js"
        page.injectJs "static/js/jquery-1.8.2.min.js"
        page.evaluate scrapeEl, system.args[2]
        phantom.exit()

Run it with:

[]

Bait and Switch: An iOS Phishing Scam Using the iTunes Terms of Service

[Guest post by Ryan Bailey] Earlier this year roughly 50,000 stolen iTunes accounts were posted to a Chinese online auction site with prices ranging from 15 cents to $30 each. Many forms of attacks can be leveraged in acquiring passwords such as these through covert means, but almost none provide such a straightforward plan of attack like Phishing. Phishing, like many other forms of modern day email spam, is a form of social engineering aimed at acquiring sensitive information by attempting to fool users into freely surrendering passwords, credit card information or other potentially valuable information. Most current day attacks come in the form of an email seeking users to verify their account or billing details. These social engineering attempts often utilize pixel perfect facsimiles of websites or newsletters in order to gain a user’s trust. That’s where this phishing proof of concept gets its cue.

[]

Simple init.d script template

Recently I found the need to create an init.d script and since I had a hard time finding an example elsewhere (That said, if you know of such an example I’d love to hear from you.), here’s the overly simple script I came up with to get the job done:

#!/bin/bash
# myapp daemon
# chkconfig: 345 20 80
# description: myapp daemon
# processname: myapp

DAEMON_PATH="/home/wes/Development/projects/myapp"

DAEMON=myapp
DAEMONOPTS="-my opts"

NAME=myapp
DESC="My daemon description"
PIDFILE=/var/run/$NAME.pid
SCRIPTNAME=/etc/init.d/$NAME

case "$1" in
start)
	printf "%-50s" "Starting $NAME..."
	cd $DAEMON_PATH
	PID=`$DAEMON $DAEMONOPTS > /dev/null 2>&1 & echo $!`
	#echo "Saving PID" $PID " to " $PIDFILE
        if [ -z $PID ]; then
            printf "%s\n" "Fail"
        else
            echo $PID > $PIDFILE
            printf "%s\n" "Ok"
        fi
;;
status)
        printf "%-50s" "Checking $NAME..."
        if [ -f $PIDFILE ]; then
            PID=`cat $PIDFILE`
            if [ -z "`ps axf | grep ${PID} | grep -v grep`" ]; then
                printf "%s\n" "Process dead but pidfile exists"
            else
                echo "Running"
            fi
        else
            printf "%s\n" "Service not running"
        fi
;;
stop)
        printf "%-50s" "Stopping $NAME"
            PID=`cat $PIDFILE`
            cd $DAEMON_PATH
        if [ -f $PIDFILE ]; then
            kill -HUP $PID
            printf "%s\n" "Ok"
            rm -f $PIDFILE
        else
            printf "%s\n" "pidfile not found"
        fi
;;

restart)
  	$0 stop
  	$0 start
;;

*)
        echo "Usage: $0 {status|start|stop|restart}"
        exit 1
esac

This script will work in /etc/init.d on Xubuntu 11.10 (so most Debian-based systems) and CentOS 5.5 and you can control it via chkconfig.

[]

Fun with jsonselect

One of the strengths of CSS and jQuery is that it provides a common and powerful mechanism known as a selector language for referencing bits of data, especially data whose structure is not exactly known at runtime which makes such an addressing scheme a perfect fit for the often lumpy world of HTML.

Increasingly JSON is being used as a transport medium for data and with the rise of NoSQL solutions, having a selector language for JSON makes a lot of sense when dealing with JSON documents whose structure isn’t deterministic.

[]

Finding yesterday’s beginning and ending unix timestamp

When writing reports I’ve often come across the need to find the unix timestamp beginning and end of a day. Here is a Python snippet that does just that.

yesterday = datetime.datetime.now() - datetime.timedelta(days = 1)
yesterday_beginning = datetime.datetime(yesterday.year, yesterday.month, yesterday.day,0,0,0,0)
yesterday_beginning_time = int(time.mktime(yesterday_beginning.timetuple()))
yesterday_end = datetime.datetime(yesterday.year, yesterday.month, yesterday.day,23,59,59,999)
yesterday_end_time = int(time.mktime(yesterday_end.timetuple()))

print yesterday_beginning_time
print yesterday_end_time
[]

MongoDB script to check the status of background index builds

Here is a simple script I’ve found to be quite helpful for monitoring the status of background index builds across shards on a system:

var currentOps = db.currentOp();

if(!currentOps.inprog || currentOps.inprog.length < 1) {
    print("No operations in progress");
} else {
    for(o in currentOps.inprog) {
        var op = currentOps.inprog[o];
        if(op.msg && op.msg.match(/bg index build/)) {
            print(op.opid+' - '+op.msg);
        }
    }
}

Here’s the output:

$ mongo mycluster:30000/mydb bgIndexBuildStatus.js
MongoDB shell version: 1.8.1
connecting to: mycluster:30000/mydb
shard0000:343812263 - bg index build 122042652/165365928 73%
shard0001:355224633 - bg index build 111732254/165568168 67%
[]

Check an array for all null values

Here is a simple function to check an array to see if it contains all null values.

function allNulls($arr) {
    if(is_array($arr) && count(array_diff($arr, array(null))) == 0) {
            return true;
    }

    return false;
}

echo (allNulls(array(null,null,null)) ? "true" : "false") . PHP_EOL;
echo (allNulls(array(null,1,null)) ? "true" : "false") . PHP_EOL;
echo (allNulls(array("test",null,null)) ? "true" : "false") . PHP_EOL;
echo (allNulls(array("",null,null)) ? "true" : "false") . PHP_EOL;
echo (allNulls(array(0,null,null)) ? "true" : "false") . PHP_EOL;
[]