#!/usr/bin/env python
#
#   Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
import fnmatch, os, sys, time, uuid
from pymongo import MongoClient
def mongo_server(server, port, database, collection):
    """Open a MongoDB client and look up a database and collection on it.

    Args:
        server: Host name or address handed to ``MongoClient``.
        port: Port number handed to ``MongoClient``.
        database: Name used to index the client for a database handle.
        collection: Name used to index the database for a collection handle.

    Returns:
        A ``(client, database, collection)`` tuple of the three handles.

    If any of the three steps raises, "Mongo server failure" is printed and
    the process exits with status 1.
    """
    # connect to the mongo server
    try:
        m_server = MongoClient(server, port)
        m_database = m_server[database]
        m_collection = m_database[collection]
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit are not reported as a server failure.
        print("Mongo server failure")
        # Non-zero status lets a calling shell or script detect the failure.
        sys.exit(1)
    return m_server, m_database, m_collection
 
def process_doc(input, m_server, m_database, m_collection, batch_size=100):
    """Insert every ``*.txt`` file found under a directory into a collection.

    The tree rooted at ``input`` is walked, each matching file is decoded as
    UTF-8 and stored as ``{"doc": <text>}``. Documents are sent to
    ``m_collection.insert`` in batches; running totals are printed after each
    full batch and once more at the end.

    Args:
        input: Root directory to search recursively for ``*.txt`` files.
        m_server: Unused here; kept so existing callers keep working.
        m_database: Unused here; kept so existing callers keep working.
        m_collection: Object whose ``insert(list_of_dicts)`` receives batches.
        batch_size: Number of pending documents that triggers a flush.

    Files that are not valid UTF-8 are skipped and counted as failed. Errors
    while opening or reading a file are not caught and propagate.
    """
    paths = []
    for root, _dirnames, filenames in os.walk(input):
        for filename in fnmatch.filter(filenames, '*.txt'):
            paths.append(os.path.join(root, filename))

    pending = []
    loaded = 0
    failed = 0
    for path in paths:
        # Flush a full batch before reading the next file, so a run that ends
        # exactly on a batch boundary reports its totals only once.
        if pending and len(pending) >= batch_size:
            # NOTE(review): Collection.insert was removed in PyMongo 4
            # (insert_many replaces it) - confirm the deployed pymongo version.
            m_collection.insert(pending)
            print("%d total docs." % loaded)
            print("%d docs failed." % failed)
            pending = []

        # Read raw bytes and close the handle promptly; decoding is done
        # explicitly below so the same code runs on Python 2 and 3.
        with open(path, 'rb') as handle:
            raw = handle.read()

        try:
            text = raw.decode("utf-8")
        except UnicodeDecodeError:
            failed += 1
        else:
            pending.append({"doc": text})
            loaded += 1

    if pending:
        m_collection.insert(pending)
    print("%d total docs." % loaded)
    print("%d docs failed." % failed)
 
def print_help():
    """Print the command-line usage summary, then exit with status 0."""
    usage_lines = (
        "-i \t<input path to files> (default is /mnt/)",
        "-s \t<mongo server> (default is localhost)",
        "-p \t<mongo port> (default is 27017)",
        "-d \t<mongo database> (default is local)",
        "-c \t<mongo collection> (default is collection)",
        # The trailing newline leaves a blank line after the usage text.
        "-h \thelp\n",
    )
    for line in usage_lines:
        print(line)
    sys.exit(0)
 
def process_args(args):
    """Parse the command-line flags into connection and input settings.

    Recognized flags are ``-i`` (input path), ``-s`` (server), ``-p`` (port,
    converted to ``int``), ``-d`` (database) and ``-c`` (collection). Each
    flag consumes the argument that follows it.

    Args:
        args: Sequence of argument strings without the program name.

    Returns:
        An ``(input, server, port, database, collection)`` tuple, with
        defaults ``"/mnt/"``, ``"localhost"``, ``27017``, ``"local"`` and
        ``"collection"`` for any flag that was not given.

    Any unrecognized argument (including ``-h``), a flag with no value after
    it, or a non-integer port calls ``print_help()``.
    """
    # default initialization, keyed by the flag that overrides each value
    settings = {
        "-i": "/mnt/",
        "-s": "localhost",
        "-p": 27017,
        "-d": "local",
        "-c": "collection",
    }

    # process args
    position = 0
    while position < len(args):
        flag = args[position]
        if flag in settings:
            try:
                value = args[position + 1]
                if flag == "-p":
                    value = int(value)
            except (IndexError, ValueError):
                # Missing value after the flag, or a port that is not a number.
                print_help()
            else:
                settings[flag] = value
                position += 1  # skip over the value just consumed
        else:
            print_help()
        position += 1

    return (settings["-i"], settings["-s"], settings["-p"],
            settings["-d"], settings["-c"])
 
def get_args():
    """Return the command-line arguments without the program name.

    Returns:
        A new list holding every element of ``sys.argv`` after the first.
    """
    # Slicing already produces a fresh list, so no element-wise copy is needed.
    return sys.argv[1:]
 
def main():
    """Run the loader end to end and report how long it took.

    Parses the command line, opens the Mongo handles, loads the documents
    found under the input path, then prints the elapsed wall-clock seconds.
    """
    start_time = time.time()
    # Named source_dir rather than input to avoid shadowing the builtin.
    source_dir, server, port, database, collection = process_args(get_args())
    m_server, m_database, m_collection = mongo_server(server, port, database, collection)
    process_doc(source_dir, m_server, m_database, m_collection)
    print("Took %s seconds to complete." % (time.time() - start_time))


if __name__ == "__main__":
    main()