Source code for hemlock.clients.hfs_old

#!/usr/bin/env python
#
#   Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

from hemlock_debugger import Hemlock_Debugger
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO

import hemlock_base

import ast
import base64
import csv
import fnmatch
import json
import hashlib
import magic
import os
import sys
import time
import uuid
import xlrd
import xmltodict

[docs]class HFs: def __init__(self): self.log = Hemlock_Debugger()
[docs] def connect_client(self, debug, client_dict): # DEBUG input = "/mnt/" try: input = client_dict['FILE_PATH'] except: print "Failure with the creds file" sys.exit(0) return input
[docs] def get_data(self, debug, client_dict, c_server, h_server, client_uuid, no_couchbase): # DEBUG self.process_files(debug, c_server, h_server, client_uuid, no_couchbase) return [[]], []
[docs] def format_lists(self, debug, j_list, h_server, client_uuid, no_couchbase): # DEBUG data_list = [[]] desc_list = [] i = 0 for record in j_list: data_list[0].append([]) desc_list.append([]) while record[0] == '"' or record[0] == "'": record = record.decode('unicode-escape')[1:-1] record = record.encode('ascii', 'ignore') record = ast.literal_eval(record) for key in record: data_list[0][i].append(str(record[key])) desc_list[i].append([str(key)]) i += 1 h_inst = hemlock_base.Hemlock_Base() h_inst.send_data(debug, data_list, desc_list, h_server, client_uuid, no_couchbase) return
[docs] def process_files(self, debug, input, h_server, client_uuid, no_couchbase): # !! TODO # check by file extension first, import from file_types should match # extension name. second check mimetype, again file_types should # match the name. finally if none of those match, generic, then fall # back to no mimetype or known extension type and try to base64 # encode it # DEBUG matches = [] errors = 0 for root, dirnames, filenames in os.walk(input): for filename in fnmatch.filter(filenames, '*.*'): matches.append(os.path.join(root, filename)) i = 0 j_list = [] # DEBUG for file in matches: print file file_mime = magic.from_file(file, mime=True) f = open(file, 'rb') # DEBUG try: # !! TODO # this should be a more explicit check against the # extension, not the name if "csv" in file: # DEBUG try: f.close() with open(file, 'rb') as csvfile: reader = csv.reader(csvfile, delimiter=',', quotechar='"') hrow = reader.next() for row in reader: j = 0 j_str = "{" while j < len(hrow): j_str += "\""+hrow[j]+"\":" j_str += "\""+ row[j]+"\"," j += 1 j_str = j_str[:-1]+"}" if j_str != "}": j_str = json.dumps(repr(j_str)) j_list.append(j_str) self.format_lists(debug, j_list, h_server, client_uuid, no_couchbase) j_list = [] i += 1 except: f = open(file, 'rb') j_str = json.dumps( { "payload": f.read() } ) j_list.append(j_str) self.format_lists(debug, j_list, h_server, client_uuid, no_couchbase) j_list = [] i += 1 elif "xls" in file: # DEBUG try: wb = xlrd.open_workbook(file) wb_sn = wb.sheet_names() for sn in wb_sn: sh = wb.sheet_by_name(sn) j = 0 header = [] for rownum in xrange(sh.nrows): j_str = "{" if j == 0: header = sh.row_values(rownum) else: row = sh.row_values(rownum) k = 0 header2 = [] while k < len(header): if header[k] != "": if header[k] in header2: j_str += "\""+unicode(header[k])+str(k)+"\":\""+unicode(row[k])+"\"," else: j_str += "\""+unicode(header[k])+"\":\""+unicode(row[k])+"\"," header2.append(header[k]) else: j_str += "\"empty-"+str(k)+"\":\""+unicode(row[k])+"\"," k += 1 j += 1 j_str = j_str[:-1]+"}" if j_str != "}": j_str = json.dumps(j_str) j_list.append(j_str) self.format_lists(debug, j_list, h_server, client_uuid, no_couchbase) j_list = [] i += 1 except: b64_text = base64.b64encode(f.read()) j_str = json.dumps( { "payload": b64_text } ) j_list.append(j_str) self.format_lists(debug, j_list, h_server, client_uuid, no_couchbase) j_list = [] i += 1 else: j_str = json.dumps( { "payload": f.read() } ) j_list.append(j_str) self.format_lists(debug, j_list, h_server, client_uuid, no_couchbase) j_list = [] i += 1 except: # !! TODO if file is xml # !! TODO if file is json # !! TODO if file is doc # !! TODO if file is ppt # DEBUG if file_mime: if "pdf" in file_mime: # DEBUG try: text = self.convert_pdf(debug, file) j_str = json.dumps( { "payload" : text } ) except: b64_text = base64.b64encode(f.read()) j_str = json.dumps( { "payload": b64_text } ) elif "text" in file_mime: j_str = json.dumps( { "payload": repr(f.read()) } ) elif "pcap" in file_mime: # DEBUG try: u = str(uuid.uuid4()) cmd = "tshark -r "+file+" -T text -V > "+u junk = os.popen(cmd).read() g = open(u, 'rb') a = [] b = {} for line in g: if line == "\n": # a frame for element in a: # DEBUG try: e_list = element.split(":",1) b[e_list[0].strip()] = e_list[1].strip() except: # ignore junk junk = element j_str = json.dumps(b) a = [] b = {} j_list.append(j_str) self.format_lists(debug, j_list, h_server, client_uuid, no_couchbase) j_list = [] a.append(line) g.close() cmd = "rm -rf "+u junk = os.popen(cmd).read() except: if g: g.close() cmd = "rm -rf "+u junk = os.popen(cmd).read() print sys.exc_info()[0] print "need tshark installed to process pcap files" b64_text = base64.b64encode(f.read()) j_str = json.dumps( { "payload": b64_text } ) else: #print file, file_mime b64_text = base64.b64encode(f.read()) j_str = json.dumps( { "payload": b64_text } ) i += 1 j_list.append(j_str) self.format_lists(debug, j_list, h_server, client_uuid, no_couchbase) j_list = [] else: print file, "no mimetype" f.close() print errors,"errors." print i,"documents." return
[docs] def convert_pdf(self, debug, input): # DEBUG rsrcmgr = PDFResourceManager() retstr = StringIO() codec = 'utf-8' laparams = LAParams() device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) fp = file(input, 'rb') process_pdf(rsrcmgr, device, fp) fp.close() device.close() str = retstr.getvalue() retstr.close() return str