Mirror of https://github.com/alexgo-io/stacks-puppet-node.git (synced 2026-01-12 22:43:42 +08:00)
first commit
@@ -1,4 +1,4 @@
-fgsearch
+nodepath
 ========

 Search API for FreeGraph
0  blockstack_search/crawler/__init__.py  Normal file
61  blockstack_search/crawler/common.py  Executable file
@@ -0,0 +1,61 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2013 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

import json
from json import JSONEncoder
from bson.objectid import ObjectId
import logging
from config import DEBUG

#-------------------------
def get_logger(log_name=None, log_type='stream'):

    if(DEBUG):
        log = logging.getLogger(log_name)
        log.setLevel(logging.DEBUG)

        formatter_stream = logging.Formatter('[%(levelname)s] %(message)s')
        handler_stream = logging.StreamHandler()
        handler_stream.setFormatter(formatter_stream)

        formatter_file = logging.Formatter('[%(levelname)s] %(message)s')
        handler_file = logging.FileHandler('log/debug.log', mode='w')
        handler_file.setFormatter(formatter_file)

        if(log_type == 'stream'):
            log.addHandler(handler_stream)
        elif(log_type == 'file'):
            log.addHandler(handler_file)
    else:
        #logging is disabled when DEBUG is off
        log = None

    return log

#-------------------------
#common logger
log = get_logger()

class MongoEncoder(JSONEncoder):
    def default(self, obj, **kwargs):
        if isinstance(obj, ObjectId):
            return str(obj)
        else:
            #pass self explicitly; otherwise JSONEncoder.default() receives obj as 'self'
            return JSONEncoder.default(self, obj, **kwargs)

#-------------------------
def pretty_dump(input):

    return json.dumps(input, cls=MongoEncoder, sort_keys=False, indent=4, separators=(',', ': '))

#-------------------------
def pretty_print(input):
    print pretty_dump(input)

#---------------------------------
def error_reply(msg):
    reply = {}
    reply['status'] = -1
    reply['message'] = "ERROR: " + msg
    return pretty_dump(reply)
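How these helpers fit together, as a minimal sketch (not part of the commit; it assumes config.py is importable and pymongo/bson is installed):

    #exercise common.py from a Python shell
    from bson.objectid import ObjectId
    from common import get_logger, pretty_dump, error_reply

    log = get_logger('crawler')                      #log_type defaults to 'stream'
    log.debug('starting up')

    #ObjectId is not JSON-serializable by default; MongoEncoder stringifies it
    print pretty_dump({'_id': ObjectId(), 'status': 1})
    print error_reply('no URL given')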
11  blockstack_search/crawler/config.py  Executable file
@@ -0,0 +1,11 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

PORT = 5001
DEBUG = True
FG_API_SLUG = '/api/users'
SUBDOMAINS = ['freegraph','fg']
SCANPORTS = ['80','5000','8555']
145  blockstack_search/crawler/crawler.py  Executable file
@@ -0,0 +1,145 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

import json
from flask import Flask, render_template, request
from common import pretty_dump, error_reply
import requests

app = Flask(__name__)
app.config.from_object('config')

from pymongo import MongoClient
c = MongoClient()

fg = c['freegraph']

#-------------------------
def get_domain_from_url(url):

    from urlparse import urlparse

    o = urlparse(url)

    domain = o.hostname

    return domain.lower()

#-------------------------
def check_host_url_inner(url):

    #headers = {'Content-type': 'application/json', 'Accept': 'text/plain', 'Authorization': 'Basic'}

    print "checking: " + url

    try:
        r = requests.get(url)
    except:
        return False, None

    print r.status_code

    if(r.status_code == 200):
        try:
            data = r.json()
        except:
            return False, None

        if 'users' in data.keys():
            return True, data

    #fall through on non-200 responses or a missing 'users' key,
    #so callers can always unpack a (found, data) tuple
    return False, None

#-------------------------
def check_host_url(domain):

    check_urls = []
    check_servers = []

    check_servers.append(domain)

    for i in app.config['SUBDOMAINS']:
        check_servers.append(i + '.' + domain)

    for server in check_servers:

        for port in app.config['SCANPORTS']:
            check_urls.append('http://' + server + ':' + port + app.config['FG_API_SLUG'])

    for url in check_urls:
        reply, data = check_host_url_inner(url)
        if(reply):
            return url, data

    return False, None

#-----------------------------------
@app.route('/')
def index():

    return render_template('index.html')

#-----------------------------------
@app.route('/host', methods=['GET'])
def get_host():

    try:
        input_url = request.values['url']

        #check if 'http' or 'https' was entered, if not then append 'http'
        if((input_url.find('http://') == -1) and (input_url.find('https://') == -1)):
            input_url = 'http://' + input_url

    except:
        return error_reply("No URL given")

    domain = get_domain_from_url(str(input_url))

    host_url, data = check_host_url(domain)
    nodes = []

    if(host_url is not False):
        reply = fg.hosts.find_one({'domain': domain})

        if(reply):
            fg.hosts.remove(reply)

        host = {}
        host['domain'] = domain
        host['host_url'] = host_url
        host['data'] = data
        fg.hosts.insert(host)

        nodes = data['users'].keys()

        print nodes

        for username in nodes:

            node = {}
            node['node_url'] = host_url + '/' + username

            reply = fg.nodes.find_one({'node_url': node['node_url']})

            if(reply):
                fg.nodes.remove(reply)

            node['data'] = requests.get(node['node_url']).json()

            try:
                full_name = node['data']['name']['first'].lower() + ' ' + node['data']['name']['last'].lower()
            except:
                node['full_name'] = ""
            else:
                node['full_name'] = full_name

            fg.nodes.insert(node)

    return render_template('node.html', domain=domain, host_url=host_url, nodes=nodes)

#------------------
if __name__ == '__main__':
    app.run(debug=app.config['DEBUG'], port=app.config['PORT'])
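For a given domain, check_host_url probes every server/port combination built from the config values. A small standalone sketch of the grid it generates (values copied from crawler/config.py above; 'example.com' is a placeholder):

    SUBDOMAINS = ['freegraph', 'fg']
    SCANPORTS = ['80', '5000', '8555']
    FG_API_SLUG = '/api/users'

    servers = ['example.com'] + [s + '.example.com' for s in SUBDOMAINS]
    urls = ['http://' + server + ':' + port + FG_API_SLUG
            for server in servers
            for port in SCANPORTS]

    #9 candidates, e.g. http://example.com:80/api/users and http://fg.example.com:8555/api/users
    print urls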
62  blockstack_search/crawler/discovery.py  Executable file
@@ -0,0 +1,62 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

import json
from flask import Flask, render_template
from common import pretty_dump, error_reply

app = Flask(__name__)

#-----------------------------------
@app.route('/')
def index():

    from datetime import datetime
    time = datetime.now()
    return render_template('discovery.html', time=time.strftime('%X'))

#-----------------------------------
@app.route('/poll/<string:target>', methods=['GET'])
def poll_target(target):

    reply = {}

    blocks = '270941'

    if(target == 'blockchain'):
        reply['status'] = 1
        reply['message'] = "Refreshed discovery_queue from source 'bitcoin blockchain'. Latest blocks: " + blocks

    elif(target == 'crawlindex'):
        from datetime import datetime, timedelta
        diff = timedelta(hours=24)

        last_crawled = datetime.now() - diff

        reply['status'] = 1
        reply['message'] = "Refreshed discovery_queue from source 'crawl index'. Oldest crawled URL: " + last_crawled.strftime('%Y-%m-%d %X')

    else:
        reply = "Target '" + target + "' not recognized"
        return error_reply(reply)

    return pretty_dump(reply)

#-----------------------------------
@app.errorhandler(500)
def internal_error(error):

    return error_reply("Something went wrong with the server")

#-----------------------------------
@app.errorhandler(404)
def page_not_found(error):

    return error_reply('URL not found on this server')

#------------------
if __name__ == '__main__':
    app.run(debug=True)
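With the discovery app running (Flask's default port 5000, since app.run() is called without a port), the poll endpoint can be exercised directly:

> curl http://localhost:5000/poll/blockchain
> curl http://localhost:5000/poll/crawlindex

Any other target returns the error_reply JSON with status -1.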
0  blockstack_search/crawler/log/debug.log  Normal file
27  blockstack_search/crawler/templates/discovery.html  Normal file
@@ -0,0 +1,27 @@
<html>
<head>

<script type="text/javascript">

//Because the page will automatically refresh, we should mention it on the webpage

function reFresh() {
    location.reload(true)
}

// Set the number below to the amount of delay, in milliseconds,
// you want between page reloads: 1 minute = 60000 milliseconds.
window.setInterval("reFresh()", 300000);

</script>

</head>

<body>
This page refreshes every 5 minutes.<br><br>

Time right now is: {{time}}<br><br>

</body>

</html>
22  blockstack_search/crawler/templates/index.html  Normal file
@@ -0,0 +1,22 @@
<html>
<head>
</head>

<body>
FreeGraph crawler is starting up ... <br><br>
Initializing ... <br><br>

Current nodes in the index: <br><br>

http://halfmoonlabs.com <br>
http://cs.princeton.edu <br><br>

Current users in the index: <br><br>

Ryan Shea, Halfmoon Labs<br>
Muneeb Ali, Halfmoon Labs<br>
JP Singh, Princeton CS<br>

</body>

</html>
13  blockstack_search/crawler/templates/node.html  Normal file
@@ -0,0 +1,13 @@
<html>
<head>
</head>

<body>
Checking domain: {{domain}} <br><br>
FreeGraph API found: {{host_url}} <br><br>

Added users (nodes): {% for node in nodes %}{{node}} {% endfor %}

</body>

</html>
13  blockstack_search/requirements.txt  Normal file
@@ -0,0 +1,13 @@
Flask==0.10.1
Jinja2==2.7.2
MarkupSafe==0.18
Werkzeug==0.9.4
itsdangerous==0.23
pyes==0.90.1
pylibmc==1.2.3
pymongo==2.6.3
pytz==2013.9
requests==2.2.1
six==1.5.2
urllib3==1.7.1
wsgiref==0.1.2
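The pinned packages install in one step with standard pip (a virtualenv is the usual choice but not required):

> pip install -r blockstack_search/requirements.txt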
100  blockstack_search/search/README.md  Normal file
@@ -0,0 +1,100 @@
# Scope Search

We currently have three search sub-systems to handle search queries:

* Substring search on people names
* Substring search on company names
* Search on the raw Lucene index

We assume that the user is entering either a *person's name* OR a *company's name* in the search query. The API expects input of the format:

    {
        "query": "the search query/term",
        "limit_results": "numeric limit on the number of results, e.g., 50; this parameter is optional"
    }

The API returns a JSON object of the format:

    {
        "companies": [],
        "people": []
    }

### Quick Testing

You can test the search API using curl:

> curl http://54.200.33.184/search/api/v1.0/people -G -d "query=peter%20thiel"

OR by using [test_client.py](test_client.py):

> ./test_client.py "peter thiel"

Make sure that the packages listed in requirements.txt are installed before using test_client.py.

### Search API

#### People API

The people API can be accessed via:

> curl http://54.200.33.184/search/api/v1.0/people -G -d "query=peter%20thiel"

This currently returns up to a maximum of 20 results (can be fewer, depending on the query) with the following data:

* 'first_name'
* 'last_name'
* 'overview' -- overview of the person
* 'companies' -- each company has 1) the person's title, 2) the name of the company, and 3) the permalink of the company
* 'crunchbase_slug' -- can be used to build the Crunchbase URL as http://www.crunchbase.com/person/ + 'crunchbase_slug'
* 'twitter_handle' -- Twitter username
* 'linkedin_url' -- LinkedIn URL
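For illustration only, a hypothetical response envelope for the query above; the values are invented, and the field set follows the list above:

    {
        "companies": [],
        "people": [
            {
                "first_name": "peter",
                "last_name": "thiel",
                "overview": "...",
                "companies": [{"title": "...", "name": "...", "permalink": "..."}],
                "crunchbase_slug": "peter-thiel",
                "twitter_handle": "...",
                "linkedin_url": "..."
            }
        ]
    }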
#### Company API

The company API can be accessed via:

> curl http://54.200.33.184/search/api/v1.0/company -G -d "query=bank%20simple"

This currently returns up to a maximum of 20 results (can be fewer, depending on the query) with the following data:

* 'name' -- company name
* 'homepage_url' -- company website
* 'email_address' -- email, if given on Crunchbase
* 'email_info' -- information on url_domain and email_domain, and whether they can be verified
* 'total_money_raised' -- the total $$ raised
* 'people' -- list of current employees
* 'board' -- list of board members
* 'overview' -- overview text from Crunchbase
* 'tag_list' -- combination of tags and categories from Crunchbase (Crunchbase treats them separately; we don't)
* 'crunchbase_slug' -- can be used to build the Crunchbase URL as http://www.crunchbase.com/company/ + 'crunchbase_slug'
* 'offices' -- info on company office(s)
* 'acquisition' -- if acquired, the year it was acquired in

## Installing on UNIX

### Requirements

All required Python packages are listed in 'requirements.txt'. In addition to those, Elasticsearch is also required.

### Elasticsearch

The Elasticsearch library is not in the GitHub repo and resides at

    unix/lib/elastic

The current version we're using is *0.90.2*. Download it from:

> wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-0.90.2.zip

### Converting raw data to a search index

Right now, the steps required for going from raw data to "ready for searching" are:

> python scope/datasets/crunchbase/filter_crunchbase_data.py --filter_people
> python scope/datasets/crunchbase/filter_crunchbase_data.py --filter_company
> python scopesearch/substring_search.py --create_cache
> python scopesearch/create_search_index.py --create_people_index
> python scopesearch/create_search_index.py --create_company_index

We'll simplify these steps in an upcoming release. We assume that both MongoDB and Elasticsearch are running on the server.
0  blockstack_search/search/__init__.py  Normal file
55  blockstack_search/search/common.py  Executable file
@@ -0,0 +1,55 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2013 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

import json
from json import JSONEncoder
from bson.objectid import ObjectId
import logging
from config import DEBUG

#-------------------------
def get_logger(log_name=None, log_type='stream'):

    if(DEBUG):
        log = logging.getLogger(log_name)
        log.setLevel(logging.DEBUG)

        formatter_stream = logging.Formatter('[%(levelname)s] %(message)s')
        handler_stream = logging.StreamHandler()
        handler_stream.setFormatter(formatter_stream)

        log.addHandler(handler_stream)

    else:
        log = None

    return log

#-------------------------
#common logger
log = get_logger()

class MongoEncoder(JSONEncoder):
    def default(self, obj, **kwargs):
        if isinstance(obj, ObjectId):
            return str(obj)
        else:
            #pass self explicitly; otherwise JSONEncoder.default() receives obj as 'self'
            return JSONEncoder.default(self, obj, **kwargs)

#-------------------------
def pretty_dump(input):

    return json.dumps(input, cls=MongoEncoder, sort_keys=False, indent=4, separators=(',', ': '))

#-------------------------
def pretty_print(input):
    print pretty_dump(input)

#---------------------------------
def error_reply(msg):
    reply = {}
    reply['status'] = -1
    reply['message'] = "ERROR: " + msg
    return pretty_dump(reply)
10  blockstack_search/search/config.py  Executable file
@@ -0,0 +1,10 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

PORT = 5001
DEBUG = True
BULK_INSERT_LIMIT = 1000
DEFAULT_LIMIT = 50
146  blockstack_search/search/create_search_index.py  Executable file
@@ -0,0 +1,146 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

'''
functions for building the ES/Lucene search index and mappings
'''
import sys
import json    #used by test_query() below
from pyes import *
conn = ES()

from pymongo import MongoClient
c = MongoClient()

INPUT_OPTIONS = '--create_index --search'

from config import BULK_INSERT_LIMIT
from common import log

#-------------------------
def create_mapping(index_name, index_type):

    '''
    for creating the Lucene mapping;
    can add different mappings for different index_types
    '''

    try:
        #delete the old mapping, if it exists
        conn.indices.delete_index(index_name)
    except:
        pass

    conn.indices.create_index(index_name)

    mapping = {u'full_name': {'boost': 3.0,
                              'index': 'analyzed',
                              'store': 'yes',
                              'type': u'string',
                              "term_vector": "with_positions_offsets"},

               u'bio': {'boost': 1.0,
                        'index': 'analyzed',
                        'store': 'yes',
                        'type': u'string',
                        "term_vector": "with_positions_offsets"},

               u'data': {'boost': 2.0,
                         'index': 'analyzed',
                         'store': 'yes',
                         'type': u'string',
                         "term_vector": "with_positions_offsets"}}

    conn.indices.put_mapping(index_type, {'properties': mapping}, [index_name])

#-------------------------
def create_people_index():

    create_mapping("fg_people_index", "fg_people_type")

    from pymongo import MongoClient
    from bson import json_util
    import json

    c = MongoClient()

    db = c['freegraph']
    nodes = db.nodes

    counter = 0

    for i in nodes.find():

        data = i['data']

        print i

        conn.index({'full_name': i['data']['name']['full'],
                    'bio': i['data']['bio'],
                    'data': json.dumps(i['data'], sort_keys=True, default=json_util.default),
                    '_boost': 1},
                   "fg_people_index",
                   "fg_people_type",
                   bulk=True)

        counter += 1

        conn.indices.refresh(["fg_people_index"])

        #write in bulk
        if(counter % BULK_INSERT_LIMIT == 0):
            print '-' * 5
            print counter
            conn.refresh(["fg_people_index"])

    conn.indices.force_bulk()

#----------------------------------
def test_query(query, index=['fg_people_index']):

    q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='and')
    count = conn.count(query=q)
    count = count.count

    if(count == 0):
        q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='or')

    results = conn.search(query=q, size=20, indices=index)

    counter = 0

    results_list = []

    for i in results:
        counter += 1
        print i['full_name']

        temp = json.loads(i['data'])

        results_list.append(temp)

    #print counter

    #print results_list

#-------------------------
if __name__ == "__main__":

    try:

        if(len(sys.argv) < 2):
            print "Usage error"

        option = sys.argv[1]

        if(option == '--create_index'):
            create_people_index()
        elif(option == '--search'):
            test_query(query=sys.argv[2])
        else:
            print "Usage error"

    except Exception as e:
        print e
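Matching the option handling in the __main__ block, the script is driven from the command line:

> python create_search_index.py --create_index
> python create_search_index.py --search "peter thiel"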
224  blockstack_search/search/search_api.py  Executable file
@@ -0,0 +1,224 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

'''
a simple Flask based API for FreeGraph
'''

from flask import request, jsonify, Flask

app = Flask(__name__)

import json
from bson import json_util

DEFAULT_LIMIT = 30

#-----------------------------------
from pymongo import MongoClient
c = MongoClient()

import pylibmc
mc = pylibmc.Client(["127.0.0.1:11211"], binary=True,
                    behaviors={'tcp_nodelay': True,
                               'connect_timeout': 100,
                               'no_block': True})

import threading

#-------------------------
#class for performing multi-threaded search on three search sub-systems
class QueryThread(threading.Thread):
    def __init__(self, query, query_type, limit_results):
        threading.Thread.__init__(self)
        self.query = query
        self.query_type = query_type
        self.results = []
        self.limit_results = limit_results
        self.found_exact_match = False

    def run(self):
        if(self.query_type == 'people_search'):
            self.results = query_people_database(self.query, self.limit_results)
        elif(self.query_type == 'company_search'):
            self.found_exact_match, self.results = query_company_database(self.query)
        elif(self.query_type == 'lucene_search'):
            self.results = query_lucene_index(self.query, 'fg_people_index', self.limit_results)

#-------------------------
def query_people_database(query, limit_results=DEFAULT_LIMIT):

    '''
    substring-search people names from the cache, then load the
    matching records from mongodb
    '''

    from substring_search import search_people_by_name

    people = search_people_by_name(query, limit_results)

    results = []
    mongo_query = []

    if people is not None:

        if(len(people) == 0):
            return results
        else:
            db = c['freegraph']

            #the $in query is much faster but messes up the intended results order
            reply = db.nodes.find({"full_name": {'$in': people}})

            #the reply is a cursor; need to load the actual results first
            for i in reply:
                results.append(i['data'])

    temp = json.dumps(results, default=json_util.default)
    return json.loads(temp)

#-----------------------------------
def query_lucene_index(query, index, limit_results=DEFAULT_LIMIT):

    from pyes import StringQuery, ES
    conn = ES()

    q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='and')
    count = conn.count(query=q)
    count = count.count

    #using 'or' gives more results but result quality goes down
    if(count == 0):
        q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='or')

    results = conn.search(query=q, size=20, indices=[index])

    results_list = []

    counter = 0

    for i in results:

        temp = json.loads(i['data'])

        results_list.append(temp)

        counter += 1

        if(counter == limit_results):
            break

    return results_list

#----------------------------------
def test_alphanumeric(query):

    '''
    check if the query has only alphanumeric characters or not
    '''

    import re
    valid = re.match('^(\w+(\s)*\w*)+$', query) is not None

    #return valid
    return True

#-----------------------------------
@app.route('/search/people', methods=['GET'])
def get_people():

    query = request.values['query']
    new_limit = DEFAULT_LIMIT

    try:
        new_limit = int(request.values['limit_results'])
    except:
        pass

    '''
    cache_key = str('scopesearch_cache_' + query.lower())
    cache_reply = mc.get(cache_key)

    #if a cache hit, respond straight away
    if(cache_reply != None):
        return jsonify(cache_reply)
    '''

    results_people = []

    if test_alphanumeric(query) is False:
        pass
    else:

        threads = []

        t1 = QueryThread(query, 'people_search', new_limit)
        #t2 = QueryThread(query, 'company_search', new_limit)
        t3 = QueryThread(query, 'lucene_search', new_limit)

        threads.append(t1)
        #threads.append(t2)
        threads.append(t3)

        #start all threads
        [x.start() for x in threads]

        #wait for all of them to finish
        [x.join() for x in threads]

        #at this point all threads have finished and all queries have been performed

        #first, check people names
        people_first_source = t1.results
        #people_first_source = []

        results_people += people_first_source

        '''
        #second, check company names
        found_exact_match, results_second_source = t2.found_exact_match, t2.results

        #if found exact match then results are people working in that company
        if(found_exact_match):
            results_people += results_second_source
        #else results are list of possible companies
        else:
            results_companies = results_second_source
        '''

        #the third component is the lucene results
        results_lucene = t3.results

        #lucene results are people
        results_people += results_lucene

        '''
        #dedup all results before sending out
        from substring_search import dedup_search_results
        results_people = dedup_search_results(results_people)

        from substring_search import fix_search_order
        results_people = fix_search_order(query, results_people)
        '''

    results = {'people': results_people[:new_limit]}

    #mc.set(cache_key, results)

    return jsonify(results)

#-------------------------
def debug(query):

    return

#------------------
if __name__ == '__main__':

    app.run(debug=True, port=5003)
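With the API running (port 5003, per the __main__ block above), a query can be issued directly; both parameters map to request.values in get_people():

> curl "http://localhost:5003/search/people?query=muneeb%20ali&limit_results=5"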
260  blockstack_search/search/substring_search.py  Executable file
@@ -0,0 +1,260 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2013 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

'''
functions for substring search
'''
import sys

from pymongo import MongoClient
c = MongoClient()

from config import DEFAULT_LIMIT

INPUT_OPTIONS = '--create_cache --search <query>'

#-------------------------
def create_dedup_names_cache():

    '''
    takes people/company names from the crunchbase DB and writes deduped names to a 'cache'
    '''

    fg = c['freegraph']

    #delete any old cache
    c.drop_database('fg_search_cache')

    search_cache = c['fg_search_cache']
    people_cache = search_cache.people_cache

    nodes = fg.nodes

    #------------------------------
    #for creating the people cache

    counter = 0

    people_names = []

    for i in nodes.find():

        counter += 1

        if(counter % 1000 == 0):
            print counter

        try:
            name = i['data']['name']['first'].lower() + ' ' + i['data']['name']['last'].lower()
        except:
            pass
        else:
            people_names.append(name)

    dedup_people_names = list(set(people_names))

    insert_people_names = {'dedup_people_names': dedup_people_names}

    #save the final deduped results to mongodb (using it as a cache)
    people_cache.save(insert_people_names)

    #print '-' * 5
    #log.debug('Created deduped people_cache: %s from %s', len(dedup_people_names), len(people_names))
    #log.debug('Creating company cache ...')

    #db.posts.ensure_index('full_name')
    #log.debug('DONE! All set for searching now.')

#-------------------------
def anyword_substring_search_inner(query_word, target_words):

    '''
    return the query_word if ANY target_word starts with it, else False
    '''

    for target_word in target_words:

        if(target_word.startswith(query_word)):
            return query_word

    return False

#-------------------------
def anyword_substring_search(target_words, query_words):

    '''
    return True if all query_words match
    '''

    matches_required = len(query_words)
    matches_found = 0

    for query_word in query_words:

        reply = anyword_substring_search_inner(query_word, target_words)

        if reply is not False:

            matches_found += 1

        else:
            #important: otherwise we'd keep checking when the final answer is already False
            return False

    if(matches_found == matches_required):
        return True
    else:
        return False

#-------------------------
def substring_search(query, list_of_strings, limit_results=DEFAULT_LIMIT):

    '''
    main function to call for searching
    '''

    matching = []

    query_words = query.split(' ')
    #sort by longest word (highest probability of not finding a match)
    query_words.sort(key=len, reverse=True)

    counter = 0

    for s in list_of_strings:

        target_words = s.split(' ')

        #the anyword searching function is separate
        if(anyword_substring_search(target_words, query_words)):
            matching.append(s)

            #limit results
            counter += 1
            if(counter == limit_results):
                break

    return matching

#-------------------------
def search_people_by_name(query, limit_results=DEFAULT_LIMIT):

    query = query.lower()

    #---------------------
    #using mongodb as a cache, load data into people_names
    search_cache = c['fg_search_cache']

    people_names = []

    for i in search_cache.people_cache.find():
        people_names = i['dedup_people_names']
    #---------------------

    results = substring_search(query, people_names, limit_results)

    return results

#-------------------------
def fix_search_order(query, search_results):

    results = search_results

    results_names = []
    old_query = query
    query = query.split(' ')

    first_word = ''
    second_word = ''
    third_word = ''

    if(len(query) < 2):
        first_word = old_query
    else:
        first_word = query[0]
        second_word = query[1]

        if(len(query) > 2):
            third_word = query[2]

    #save results for multiple passes
    results_second = []
    results_third = []

    #------------------------
    for result in results:

        result_list = result['full_name'].split(' ')

        try:
            if(result_list[0].startswith(first_word)):
                results_names.append(result)
            else:
                results_second.append(result)
        except:
            results_second.append(result)

    #------------------------
    for result in results_second:

        result_list = result['full_name'].split(' ')

        try:
            if(result_list[1].startswith(first_word)):
                results_names.append(result)
            else:
                results_third.append(result)
        except:
            results_third.append(result)
    #------------------------

    #results are either in results_names (filtered) or unprocessed in results_third (last pass)
    return results_names + results_third

#-------------------------
def dedup_search_results(search_results):
    '''
    dedup results based on 'slug'
    '''

    known_links = set()
    deduped_results = []

    for i in search_results:

        link = i['url']

        if link in known_links:
            continue

        deduped_results.append(i)

        known_links.add(link)

    return deduped_results

#-------------------------
if __name__ == "__main__":

    try:

        if(len(sys.argv) < 2):
            print "Usage error"

        option = sys.argv[1]

        if(option == '--create_cache'):
            create_dedup_names_cache()
        elif(option == '--search'):
            query = sys.argv[2]
            print search_people_by_name(query, DEFAULT_LIMIT)

        else:
            print "Usage error"

    except Exception as e:
        print e
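A worked example of the any-word prefix matching (a sketch; note that importing substring_search opens a MongoClient at module load, so a local mongod must be reachable):

    from substring_search import anyword_substring_search, substring_search

    target = 'peter thiel'.split(' ')
    print anyword_substring_search(target, ['pet', 'thi'])    #True: 'pet'->'peter', 'thi'->'thiel'
    print anyword_substring_search(target, ['peter', 'x'])    #False: no word starts with 'x'

    print substring_search('pet thi', ['peter thiel', 'peter parker'])
    #['peter thiel']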
72  blockstack_search/search/test_client.py  Executable file
@@ -0,0 +1,72 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2013 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

'''
For testing the search API from the command line
'''

import sys
import requests
import json

#-------------------------
def search_client(query, server):

    print '-' * 10
    print "Searching for: " + query
    print '-' * 10

    url = 'http://localhost:5000/search/people'

    if(server == 'remote'):
        url = 'http://54.200.209.148/search/people'

    print url

    data = {'query': query, 'limit_results': 35}

    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}

    r = requests.get(url, params=data, headers=headers)

    print r

    temp = r.json()

    print '-' * 10

    print "People: "

    for i in temp['people']:

        print i
        #print i['first_name'] + ' ' + i['last_name'] + ' | ' + 'http://www.crunchbase.com/person/' + i['crunchbase_slug']

    if(len(temp['companies']) > 0):

        print '-' * 10
        print "Companies: "

        for i in temp['companies']:
            print i

    print '-' * 10

#-------------------------
if __name__ == "__main__":

    if(len(sys.argv) < 2):
        print "Error: more arguments needed"

    query = sys.argv[1]
    server = 'local'

    try:
        server = sys.argv[2]
    except:
        pass

    search_client(query, server)
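Usage mirrors the README's quick-testing section; the optional second argument selects the server and defaults to 'local':

> ./test_client.py "peter thiel"
> ./test_client.py "peter thiel" remote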