mirror of
https://github.com/alexgo-io/stacks-puppet-node.git
synced 2026-04-30 12:42:10 +08:00
first commit
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
fgsearch
|
nodepath
|
||||||
========
|
========
|
||||||
|
|
||||||
Search API for FreeGraph
|
Search API for FreeGraph
|
||||||
|
|||||||
0
blockstack_search/crawler/__init__.py
Normal file
0
blockstack_search/crawler/__init__.py
Normal file
61
blockstack_search/crawler/common.py
Executable file
61
blockstack_search/crawler/common.py
Executable file
@@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2013 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
import json
|
||||||
|
from json import JSONEncoder
|
||||||
|
from bson.objectid import ObjectId
|
||||||
|
import logging
|
||||||
|
from config import DEBUG
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def get_logger(log_name=None,log_type='stream'):
|
||||||
|
|
||||||
|
if(DEBUG):
|
||||||
|
log = logging.getLogger(log_name)
|
||||||
|
log.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
formatter_stream = logging.Formatter('[%(levelname)s] %(message)s')
|
||||||
|
handler_stream = logging.StreamHandler()
|
||||||
|
handler_stream.setFormatter(formatter_stream)
|
||||||
|
|
||||||
|
formatter_file = logging.Formatter('[%(levelname)s] %(message)s')
|
||||||
|
handler_file = logging.FileHandler('log/debug.log',mode='w')
|
||||||
|
handler_file.setFormatter(formatter_file)
|
||||||
|
|
||||||
|
if(log_type == 'stream'):
|
||||||
|
log.addHandler(handler_stream)
|
||||||
|
elif(log_type == 'file'):
|
||||||
|
log.addHandler(handler_file)
|
||||||
|
else:
|
||||||
|
log = None
|
||||||
|
|
||||||
|
return log
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
#common logger
|
||||||
|
log = get_logger()
|
||||||
|
|
||||||
|
class MongoEncoder(JSONEncoder):
|
||||||
|
def default(self, obj, **kwargs):
|
||||||
|
if isinstance(obj, ObjectId):
|
||||||
|
return str(obj)
|
||||||
|
else:
|
||||||
|
return JSONEncoder.default(obj, **kwargs)
|
||||||
|
#-------------------------
|
||||||
|
def pretty_dump(input):
|
||||||
|
|
||||||
|
return json.dumps(input, cls=MongoEncoder, sort_keys=False, indent=4, separators=(',', ': '))
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def pretty_print(input):
|
||||||
|
print pretty_dump(input)
|
||||||
|
|
||||||
|
#---------------------------------
|
||||||
|
def error_reply(msg):
|
||||||
|
reply = {}
|
||||||
|
reply['status'] = -1
|
||||||
|
reply['message'] = "ERROR: " + msg
|
||||||
|
return pretty_dump(reply)
|
||||||
11
blockstack_search/crawler/config.py
Executable file
11
blockstack_search/crawler/config.py
Executable file
@@ -0,0 +1,11 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2014 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
PORT = 5001
|
||||||
|
DEBUG = True
|
||||||
|
FG_API_SLUG = '/api/users'
|
||||||
|
SUBDOMAINS = ['freegraph','fg']
|
||||||
|
SCANPORTS = ['80','5000','8555']
|
||||||
145
blockstack_search/crawler/crawler.py
Executable file
145
blockstack_search/crawler/crawler.py
Executable file
@@ -0,0 +1,145 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2014 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
import json
|
||||||
|
from flask import Flask, render_template, request
|
||||||
|
from common import pretty_dump, error_reply
|
||||||
|
import requests
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
app.config.from_object('config')
|
||||||
|
|
||||||
|
from pymongo import MongoClient
|
||||||
|
c = MongoClient()
|
||||||
|
|
||||||
|
fg = c['freegraph']
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def get_domain_from_url(url):
|
||||||
|
|
||||||
|
from urlparse import urlparse
|
||||||
|
|
||||||
|
o = urlparse(url)
|
||||||
|
|
||||||
|
domain = o.hostname
|
||||||
|
|
||||||
|
return domain.lower()
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def check_host_url_inner(url):
|
||||||
|
|
||||||
|
#headers = {'Content-type': 'application/json', 'Accept': 'text/plain', 'Authorization': 'Basic'}
|
||||||
|
|
||||||
|
print "checking: " + url
|
||||||
|
|
||||||
|
try:
|
||||||
|
r = requests.get(url)
|
||||||
|
except:
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
print r.status_code
|
||||||
|
|
||||||
|
if(r.status_code == 200):
|
||||||
|
try:
|
||||||
|
data = r.json()
|
||||||
|
except:
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
if 'users' in data.keys():
|
||||||
|
return True, data
|
||||||
|
else:
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def check_host_url(domain):
|
||||||
|
|
||||||
|
check_urls = []
|
||||||
|
check_servers = []
|
||||||
|
|
||||||
|
check_servers.append(domain)
|
||||||
|
|
||||||
|
for i in app.config['SUBDOMAINS']:
|
||||||
|
check_servers.append(i + '.' + domain)
|
||||||
|
|
||||||
|
for server in check_servers:
|
||||||
|
|
||||||
|
for port in app.config['SCANPORTS']:
|
||||||
|
check_urls.append('http://' + server + ':' + port + app.config['FG_API_SLUG'])
|
||||||
|
|
||||||
|
for url in check_urls:
|
||||||
|
reply, data = check_host_url_inner(url)
|
||||||
|
if(reply):
|
||||||
|
return url, data
|
||||||
|
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
#-----------------------------------
|
||||||
|
@app.route('/')
|
||||||
|
def index():
|
||||||
|
|
||||||
|
return render_template('index.html')
|
||||||
|
|
||||||
|
#-----------------------------------
|
||||||
|
@app.route('/host', methods=['GET'])
|
||||||
|
def get_host():
|
||||||
|
|
||||||
|
try:
|
||||||
|
input_url = request.values['url']
|
||||||
|
|
||||||
|
#check if 'http' or 'https' was entered, if not then append 'http'
|
||||||
|
if((input_url.find('http://') == -1) and (input_url.find('https://') == -1)):
|
||||||
|
input_url = 'http://' + input_url
|
||||||
|
|
||||||
|
except:
|
||||||
|
return error_reply("No URL given")
|
||||||
|
|
||||||
|
domain = get_domain_from_url(str(input_url))
|
||||||
|
|
||||||
|
host_url, data = check_host_url(domain)
|
||||||
|
nodes = []
|
||||||
|
|
||||||
|
if(host_url is not False):
|
||||||
|
reply = fg.hosts.find_one({'domain':domain})
|
||||||
|
|
||||||
|
if(reply):
|
||||||
|
fg.hosts.remove(reply)
|
||||||
|
|
||||||
|
host = {}
|
||||||
|
host['domain'] = domain
|
||||||
|
host['host_url'] = host_url
|
||||||
|
host['data'] = data
|
||||||
|
fg.hosts.insert(host)
|
||||||
|
|
||||||
|
nodes = data['users'].keys()
|
||||||
|
|
||||||
|
print nodes
|
||||||
|
|
||||||
|
for username in nodes:
|
||||||
|
|
||||||
|
node = {}
|
||||||
|
node['node_url'] = host_url + '/' + username
|
||||||
|
|
||||||
|
reply = fg.nodes.find_one({'node_url':node['node_url']})
|
||||||
|
|
||||||
|
if(reply):
|
||||||
|
fg.nodes.remove(reply)
|
||||||
|
|
||||||
|
node['data'] = requests.get(node['node_url']).json()
|
||||||
|
|
||||||
|
try:
|
||||||
|
full_name = node['data']['name']['first'].lower() + ' ' + node['data']['name']['last'].lower()
|
||||||
|
except:
|
||||||
|
node['full_name'] = ""
|
||||||
|
else:
|
||||||
|
node['full_name'] = full_name
|
||||||
|
|
||||||
|
fg.nodes.insert(node)
|
||||||
|
|
||||||
|
return render_template('node.html',domain=domain,host_url=host_url,nodes=nodes)
|
||||||
|
|
||||||
|
#------------------
|
||||||
|
if __name__ == '__main__':
|
||||||
|
app.run(debug=app.config['DEBUG'], port=app.config['PORT'])
|
||||||
62
blockstack_search/crawler/discovery.py
Executable file
62
blockstack_search/crawler/discovery.py
Executable file
@@ -0,0 +1,62 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2014 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
import json
|
||||||
|
from flask import Flask, render_template
|
||||||
|
from common import pretty_dump, error_reply
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
#-----------------------------------
|
||||||
|
@app.route('/')
|
||||||
|
def index():
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
time = datetime.now()
|
||||||
|
return render_template('discovery.html',time=time.strftime('%X'))
|
||||||
|
|
||||||
|
#-----------------------------------
|
||||||
|
@app.route('/poll/<string:target>', methods = ['GET'])
|
||||||
|
def poll_target(target):
|
||||||
|
|
||||||
|
reply = {}
|
||||||
|
|
||||||
|
blocks = '270941'
|
||||||
|
|
||||||
|
if(target == 'blockchain'):
|
||||||
|
reply['status'] = 1
|
||||||
|
reply['message'] = "Refreshed discovery_queue from source 'bitcoin blockchain'. Latest blocks: " + blocks
|
||||||
|
|
||||||
|
elif(target == 'crawlindex'):
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
diff = timedelta(hours=24)
|
||||||
|
|
||||||
|
last_crawled = datetime.now() - diff
|
||||||
|
|
||||||
|
reply['status'] = 1
|
||||||
|
reply['message'] = "Refreshed discovery_queue from source 'crawl index'. Oldest crawled URL: " + last_crawled.strftime('%Y-%m-%d %X')
|
||||||
|
|
||||||
|
else:
|
||||||
|
reply = "Target '" + target + "' not recognized"
|
||||||
|
return error_reply(reply)
|
||||||
|
|
||||||
|
return pretty_dump(reply)
|
||||||
|
|
||||||
|
#-----------------------------------
|
||||||
|
@app.errorhandler(500)
|
||||||
|
def internal_error(error):
|
||||||
|
|
||||||
|
return error_reply("Something went wrong with the server")
|
||||||
|
|
||||||
|
#-----------------------------------
|
||||||
|
@app.errorhandler(404)
|
||||||
|
def internal_error(error):
|
||||||
|
|
||||||
|
return error_reply('URL not found on this server')
|
||||||
|
|
||||||
|
#------------------
|
||||||
|
if __name__ == '__main__':
|
||||||
|
app.run(debug=True)
|
||||||
0
blockstack_search/crawler/log/debug.log
Normal file
0
blockstack_search/crawler/log/debug.log
Normal file
27
blockstack_search/crawler/templates/discovery.html
Normal file
27
blockstack_search/crawler/templates/discovery.html
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
|
||||||
|
<script type="text/javascript">
|
||||||
|
|
||||||
|
//Because the page will automatically refresh, should mention it on the webpage
|
||||||
|
|
||||||
|
function reFresh() {
|
||||||
|
location.reload(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set the number below to the amount of delay, in milliseconds,
|
||||||
|
//you want between page reloads: 1 minute = 60000 milliseconds.
|
||||||
|
window.setInterval("reFresh()",300000);
|
||||||
|
|
||||||
|
</script>
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
This page refreshes every 5 minutes.<br><br>
|
||||||
|
|
||||||
|
Time right now is: {{time}}<br><br>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
|
||||||
|
</html>
|
||||||
22
blockstack_search/crawler/templates/index.html
Normal file
22
blockstack_search/crawler/templates/index.html
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
FreeGraph crawler is starting up ... <br><br>
|
||||||
|
Initializing ... <br><br>
|
||||||
|
|
||||||
|
Current nodes in the index: <br><br>
|
||||||
|
|
||||||
|
http://halfmoonlabs.com <br>
|
||||||
|
http://cs.princeton.edu <br><br>
|
||||||
|
|
||||||
|
Current users in the index: <br><br>
|
||||||
|
|
||||||
|
Ryan Shea, Halfmoon Labs<br>
|
||||||
|
Muneeb Ali, Halfmoon Labs<br>
|
||||||
|
JP Singh, Princeton CS<br>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
|
||||||
|
</html>
|
||||||
13
blockstack_search/crawler/templates/node.html
Normal file
13
blockstack_search/crawler/templates/node.html
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
Checking domain: {{domain}} <br><br>
|
||||||
|
FreeGraph API found: {{host_url}} <br><br>
|
||||||
|
|
||||||
|
Added users (nodes): {% for node in nodes %}{{node}} {% endfor %}
|
||||||
|
|
||||||
|
</body>
|
||||||
|
|
||||||
|
</html>
|
||||||
13
blockstack_search/requirements.txt
Normal file
13
blockstack_search/requirements.txt
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
Flask==0.10.1
|
||||||
|
Jinja2==2.7.2
|
||||||
|
MarkupSafe==0.18
|
||||||
|
Werkzeug==0.9.4
|
||||||
|
itsdangerous==0.23
|
||||||
|
pyes==0.90.1
|
||||||
|
pylibmc==1.2.3
|
||||||
|
pymongo==2.6.3
|
||||||
|
pytz==2013.9
|
||||||
|
requests==2.2.1
|
||||||
|
six==1.5.2
|
||||||
|
urllib3==1.7.1
|
||||||
|
wsgiref==0.1.2
|
||||||
100
blockstack_search/search/README.md
Normal file
100
blockstack_search/search/README.md
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
# Scope Search
|
||||||
|
|
||||||
|
We currently have three search sub-systems to handle search queries:
|
||||||
|
|
||||||
|
* Substring search on people names
|
||||||
|
* Substring search on company names
|
||||||
|
* Search on the raw lucene index
|
||||||
|
|
||||||
|
We assume that the user is entering either a *person's name* OR a *company's name* in the search query. The API expects an input of the format:
|
||||||
|
|
||||||
|
{
|
||||||
|
"query": "the search query/term",
|
||||||
|
"limit_results": "numeric limit on number of results e.g., 50, this parameter is optional"
|
||||||
|
}
|
||||||
|
|
||||||
|
The API returns a JSON object of the format:
|
||||||
|
|
||||||
|
{
|
||||||
|
"companies": [],
|
||||||
|
"people": []
|
||||||
|
}
|
||||||
|
|
||||||
|
### Quick Testing
|
||||||
|
|
||||||
|
You can test the search API using curl:
|
||||||
|
|
||||||
|
> curl http://54.200.33.184/search/api/v1.0/people -G -d "query=peter%20thiel"
|
||||||
|
|
||||||
|
OR by using the [test_client.py](test_client.py)
|
||||||
|
|
||||||
|
> ./test_client.py "peter thiel"
|
||||||
|
|
||||||
|
Make sure that the packages listed in requirements.txt are installed before using the test_client.py
|
||||||
|
|
||||||
|
### Search API
|
||||||
|
|
||||||
|
#### People API
|
||||||
|
|
||||||
|
The people API can be accessed via:
|
||||||
|
|
||||||
|
> curl http://54.200.33.184/search/api/v1.0/people -G -d "query=peter%20thiel"
|
||||||
|
|
||||||
|
This will currently return upto a max of 20 results (can be less depending on the query) with the following data:
|
||||||
|
|
||||||
|
* 'first_name'
|
||||||
|
* 'last_name'
|
||||||
|
* 'overview' -- overview of the person
|
||||||
|
* 'companies' -- each company has 1) title of person, 2) name of company, and 3) permalink of company
|
||||||
|
* 'crunchbase_slug' -- this can be used to get the crunchbase URL as http://www.crunchbase.com/person/ + 'crunchbase_slug'
|
||||||
|
* 'twitter_handle' -- twitter username
|
||||||
|
* 'linkedin_url' -- linkedin URL
|
||||||
|
|
||||||
|
#### Company API
|
||||||
|
|
||||||
|
The company API can be accessed via:
|
||||||
|
|
||||||
|
> curl http://54.200.33.184/search/api/v1.0/company -G -d "query=bank%20simple"
|
||||||
|
|
||||||
|
This will currently return upto a max of 20 results (can be less depending on the query) with the following data:
|
||||||
|
|
||||||
|
* 'name' -- company name
|
||||||
|
* 'homepage_url' -- company website
|
||||||
|
* 'email_address' -- email, if given on crunchbase
|
||||||
|
* 'email_info' -- has information on url_domain, email_domain and if can verify on them
|
||||||
|
* 'total_money_raised' -- the total $$ raised
|
||||||
|
* 'people' -- list of current employees
|
||||||
|
* 'board' -- list of board members
|
||||||
|
* 'overview' -- overview text from crunchbase
|
||||||
|
* 'tag_list' -- combination of tags and categories from crunchbase (crunchbase treats them separately, we don't)
|
||||||
|
* 'crunchbase_slug' -- this can be used to get the crunchbase URL as http://www.crunchbase.com/company/ + 'crunchbase_slug'
|
||||||
|
* 'offices' -- info on company office(s)
|
||||||
|
* 'acquisition' -- if acquired, the year it was acquired in
|
||||||
|
|
||||||
|
## Installing on UNIX
|
||||||
|
|
||||||
|
### Requirements
|
||||||
|
|
||||||
|
All required packages for Python are listed in 'requirements.txt'. In addition to those, also requires Elastic Search.
|
||||||
|
|
||||||
|
### Elastic Search
|
||||||
|
|
||||||
|
Elastic Search library is not in github and resides at
|
||||||
|
|
||||||
|
unix/lib/elastic
|
||||||
|
|
||||||
|
the current version we're using is *0.90.2*. Download from:
|
||||||
|
|
||||||
|
> wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-0.90.2.zip
|
||||||
|
|
||||||
|
### Converting RAW data to search index
|
||||||
|
|
||||||
|
Right now, the steps required for going from raw data to "ready for searching" are:
|
||||||
|
|
||||||
|
> python scope/datasets/crunchbase/filter_crunchbase_data.py --filter_people
|
||||||
|
> python scope/datasets/crunchbase/filter_crunchbase_data.py --filter_company
|
||||||
|
> python scopesearch/substring_search.py --create_cache
|
||||||
|
> python scopesearch/create_search_index.py --create_people_index
|
||||||
|
> python scopesearch/create_search_index.py --create_company_index
|
||||||
|
|
||||||
|
We'll simplify these steps in an upcoming release. We assume that both MongoDB and Elastic Search is running on the server.
|
||||||
0
blockstack_search/search/__init__.py
Normal file
0
blockstack_search/search/__init__.py
Normal file
55
blockstack_search/search/common.py
Executable file
55
blockstack_search/search/common.py
Executable file
@@ -0,0 +1,55 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2013 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
import json
|
||||||
|
from json import JSONEncoder
|
||||||
|
from bson.objectid import ObjectId
|
||||||
|
import logging
|
||||||
|
from config import DEBUG
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def get_logger(log_name=None,log_type='stream'):
|
||||||
|
|
||||||
|
if(DEBUG):
|
||||||
|
log = logging.getLogger(log_name)
|
||||||
|
log.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
formatter_stream = logging.Formatter('[%(levelname)s] %(message)s')
|
||||||
|
handler_stream = logging.StreamHandler()
|
||||||
|
handler_stream.setFormatter(formatter_stream)
|
||||||
|
|
||||||
|
log.addHandler(handler_stream)
|
||||||
|
|
||||||
|
else:
|
||||||
|
log = None
|
||||||
|
|
||||||
|
return log
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
#common logger
|
||||||
|
log = get_logger()
|
||||||
|
|
||||||
|
class MongoEncoder(JSONEncoder):
|
||||||
|
def default(self, obj, **kwargs):
|
||||||
|
if isinstance(obj, ObjectId):
|
||||||
|
return str(obj)
|
||||||
|
else:
|
||||||
|
return JSONEncoder.default(obj, **kwargs)
|
||||||
|
#-------------------------
|
||||||
|
def pretty_dump(input):
|
||||||
|
|
||||||
|
return json.dumps(input, cls=MongoEncoder, sort_keys=False, indent=4, separators=(',', ': '))
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def pretty_print(input):
|
||||||
|
print pretty_dump(input)
|
||||||
|
|
||||||
|
#---------------------------------
|
||||||
|
def error_reply(msg):
|
||||||
|
reply = {}
|
||||||
|
reply['status'] = -1
|
||||||
|
reply['message'] = "ERROR: " + msg
|
||||||
|
return pretty_dump(reply)
|
||||||
10
blockstack_search/search/config.py
Executable file
10
blockstack_search/search/config.py
Executable file
@@ -0,0 +1,10 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2014 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
PORT = 5001
|
||||||
|
DEBUG = True
|
||||||
|
BULK_INSERT_LIMIT = 1000
|
||||||
|
DEFAULT_LIMIT = 50
|
||||||
146
blockstack_search/search/create_search_index.py
Executable file
146
blockstack_search/search/create_search_index.py
Executable file
@@ -0,0 +1,146 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2014 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
'''
|
||||||
|
functions for building the ES/lucene search index and mappings
|
||||||
|
'''
|
||||||
|
import sys
|
||||||
|
from pyes import *
|
||||||
|
conn = ES()
|
||||||
|
|
||||||
|
from pymongo import MongoClient
|
||||||
|
c = MongoClient()
|
||||||
|
|
||||||
|
INPUT_OPTIONS = '--create_index --search'
|
||||||
|
|
||||||
|
from config import BULK_INSERT_LIMIT
|
||||||
|
from common import log
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def create_mapping(index_name,index_type):
|
||||||
|
|
||||||
|
'''
|
||||||
|
for creating lucene mapping
|
||||||
|
can add different mappings for different index_types
|
||||||
|
'''
|
||||||
|
|
||||||
|
try:
|
||||||
|
#delete the old mapping, if exists
|
||||||
|
conn.indices.delete_index(index_name)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
conn.indices.create_index(index_name)
|
||||||
|
|
||||||
|
mapping = { u'full_name': {'boost': 3.0,
|
||||||
|
'index': 'analyzed',
|
||||||
|
'store': 'yes',
|
||||||
|
'type': u'string',
|
||||||
|
"term_vector" : "with_positions_offsets"},
|
||||||
|
|
||||||
|
u'bio': {'boost': 1.0,
|
||||||
|
'index': 'analyzed',
|
||||||
|
'store': 'yes',
|
||||||
|
'type': u'string',
|
||||||
|
"term_vector" : "with_positions_offsets"},
|
||||||
|
|
||||||
|
u'data': {'boost': 2.0,
|
||||||
|
'index': 'analyzed',
|
||||||
|
'store': 'yes',
|
||||||
|
'type': u'string',
|
||||||
|
"term_vector" : "with_positions_offsets"},}
|
||||||
|
|
||||||
|
conn.indices.put_mapping(index_type, {'properties':mapping}, [index_name])
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def create_people_index():
|
||||||
|
|
||||||
|
create_mapping("fg_people_index","fg_people_type")
|
||||||
|
|
||||||
|
from pymongo import MongoClient
|
||||||
|
from bson import json_util
|
||||||
|
import json
|
||||||
|
|
||||||
|
c = MongoClient()
|
||||||
|
|
||||||
|
db = c['freegraph']
|
||||||
|
nodes = db.nodes
|
||||||
|
|
||||||
|
counter = 0
|
||||||
|
|
||||||
|
for i in nodes.find():
|
||||||
|
|
||||||
|
data = i['data']
|
||||||
|
|
||||||
|
print i
|
||||||
|
|
||||||
|
conn.index({'full_name' : i['data']['name']['full'],
|
||||||
|
'bio' : i['data']['bio'],
|
||||||
|
'data': json.dumps(i['data'], sort_keys=True, default=json_util.default),
|
||||||
|
'_boost' : 1,},
|
||||||
|
"fg_people_index",
|
||||||
|
"fg_people_type",
|
||||||
|
bulk=True)
|
||||||
|
|
||||||
|
counter += 1
|
||||||
|
|
||||||
|
conn.indices.refresh(["fg_people_index"])
|
||||||
|
|
||||||
|
#write in bulk
|
||||||
|
if(counter % BULK_INSERT_LIMIT == 0):
|
||||||
|
print '-' * 5
|
||||||
|
print counter
|
||||||
|
conn.refresh(["fg_people_index"])
|
||||||
|
|
||||||
|
conn.indices.force_bulk()
|
||||||
|
|
||||||
|
#----------------------------------
|
||||||
|
def test_query(query,index=['fg_people_index']):
|
||||||
|
|
||||||
|
q = StringQuery(query, search_fields = ['full_name', 'bio', 'data'], default_operator = 'and')
|
||||||
|
count = conn.count(query = q)
|
||||||
|
count = count.count
|
||||||
|
|
||||||
|
if(count == 0):
|
||||||
|
q = StringQuery(query, search_fields = ['full_name', 'bio', 'data'], default_operator = 'or')
|
||||||
|
|
||||||
|
results = conn.search(query = q, size=20, indices=index)
|
||||||
|
|
||||||
|
counter = 0
|
||||||
|
|
||||||
|
results_list = []
|
||||||
|
|
||||||
|
for i in results:
|
||||||
|
counter += 1
|
||||||
|
print i['full_name']
|
||||||
|
|
||||||
|
temp = json.loads(i['data'])
|
||||||
|
|
||||||
|
results_list.append(temp)
|
||||||
|
|
||||||
|
#print counter
|
||||||
|
|
||||||
|
#print results_list
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
if(len(sys.argv) < 2):
|
||||||
|
print "Usage error"
|
||||||
|
|
||||||
|
option = sys.argv[1]
|
||||||
|
|
||||||
|
if(option == '--create_index'):
|
||||||
|
create_people_index()
|
||||||
|
elif(option == '--search'):
|
||||||
|
test_query(query=sys.argv[2])
|
||||||
|
else:
|
||||||
|
print "Usage error"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print e
|
||||||
224
blockstack_search/search/search_api.py
Executable file
224
blockstack_search/search/search_api.py
Executable file
@@ -0,0 +1,224 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2014 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
'''
|
||||||
|
a simple Flask based API for FreeGraph
|
||||||
|
'''
|
||||||
|
|
||||||
|
from flask import request, jsonify, Flask
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
import json
|
||||||
|
from bson import json_util
|
||||||
|
|
||||||
|
DEFAULT_LIMIT = 30
|
||||||
|
|
||||||
|
#-----------------------------------
|
||||||
|
from pymongo import MongoClient
|
||||||
|
c = MongoClient()
|
||||||
|
|
||||||
|
import pylibmc
|
||||||
|
mc = pylibmc.Client(["127.0.0.1:11211"],binary=True,
|
||||||
|
behaviors={'tcp_nodelay':True,
|
||||||
|
'connect_timeout':100,
|
||||||
|
'no_block':True})
|
||||||
|
|
||||||
|
import threading
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
#class for performing multi-threaded search on three search sub-systems
|
||||||
|
class QueryThread(threading.Thread):
|
||||||
|
def __init__(self,query,query_type,limit_results):
|
||||||
|
threading.Thread.__init__(self)
|
||||||
|
self.query=query
|
||||||
|
self.query_type=query_type
|
||||||
|
self.results = []
|
||||||
|
self.limit_results = limit_results
|
||||||
|
self.found_exact_match = False
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
if(self.query_type == 'people_search'):
|
||||||
|
self.results = query_people_database(self.query, self.limit_results)
|
||||||
|
elif(self.query_type == 'company_search'):
|
||||||
|
self.found_exact_match, self.results = query_company_database(self.query)
|
||||||
|
elif(self.query_type == 'lucene_search'):
|
||||||
|
self.results = query_lucene_index(self.query,'fg_people_index', self.limit_results)
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def query_people_database(query,limit_results=DEFAULT_LIMIT):
|
||||||
|
|
||||||
|
'''
|
||||||
|
returns True, {names of employees} if exact match of company name
|
||||||
|
else returns False, [list of possible companies]
|
||||||
|
'''
|
||||||
|
|
||||||
|
from substring_search import search_people_by_name
|
||||||
|
|
||||||
|
people = search_people_by_name(query, limit_results)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
mongo_query = []
|
||||||
|
|
||||||
|
if people is not None:
|
||||||
|
|
||||||
|
if(len(people) == 0):
|
||||||
|
return results
|
||||||
|
else:
|
||||||
|
db = c['freegraph']
|
||||||
|
|
||||||
|
#the $in query is much faster but messes up intended results order
|
||||||
|
reply = db.nodes.find({"full_name":{'$in':people}})
|
||||||
|
|
||||||
|
#the reply is a cursor and need to load actual results first
|
||||||
|
for i in reply:
|
||||||
|
results.append(i['data'])
|
||||||
|
|
||||||
|
|
||||||
|
temp = json.dumps(results, default=json_util.default)
|
||||||
|
return json.loads(temp)
|
||||||
|
|
||||||
|
#-----------------------------------
|
||||||
|
def query_lucene_index(query,index,limit_results=DEFAULT_LIMIT):
|
||||||
|
|
||||||
|
from pyes import StringQuery, ES
|
||||||
|
conn = ES()
|
||||||
|
|
||||||
|
q = StringQuery(query, search_fields = ['full_name', 'bio', 'data'], default_operator = 'and')
|
||||||
|
count = conn.count(query = q)
|
||||||
|
count = count.count
|
||||||
|
|
||||||
|
#having or gives more results but results quality goes down
|
||||||
|
if(count == 0):
|
||||||
|
q = StringQuery(query, search_fields = ['full_name', 'bio', 'data'], default_operator = 'or')
|
||||||
|
|
||||||
|
results = conn.search(query = q, size=20, indices=[index])
|
||||||
|
|
||||||
|
results_list = []
|
||||||
|
|
||||||
|
counter = 0
|
||||||
|
|
||||||
|
for i in results:
|
||||||
|
|
||||||
|
temp = json.loads(i['data'])
|
||||||
|
|
||||||
|
results_list.append(temp)
|
||||||
|
|
||||||
|
counter += 1
|
||||||
|
|
||||||
|
if(counter == limit_results):
|
||||||
|
break
|
||||||
|
|
||||||
|
return results_list
|
||||||
|
|
||||||
|
#----------------------------------
|
||||||
|
def test_alphanumeric(query):
|
||||||
|
|
||||||
|
'''
|
||||||
|
check if query has only alphanumeric characters or not
|
||||||
|
'''
|
||||||
|
|
||||||
|
import re
|
||||||
|
valid = re.match('^(\w+(\s)*\w*)+$', query) is not None
|
||||||
|
|
||||||
|
#return valid
|
||||||
|
return True
|
||||||
|
|
||||||
|
#-----------------------------------
|
||||||
|
@app.route('/search/people', methods = ['GET'])
|
||||||
|
def get_people():
|
||||||
|
|
||||||
|
query = request.values['query']
|
||||||
|
new_limit = DEFAULT_LIMIT
|
||||||
|
|
||||||
|
try:
|
||||||
|
new_limit = int(request.values['limit_results'])
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
'''
|
||||||
|
cache_key = str('scopesearch_cache_' + query.lower())
|
||||||
|
cache_reply = mc.get(cache_key)
|
||||||
|
|
||||||
|
#if a cache hit, respond straight away
|
||||||
|
if(cache_reply != None):
|
||||||
|
return jsonify(cache_reply)
|
||||||
|
'''
|
||||||
|
|
||||||
|
results_people = []
|
||||||
|
|
||||||
|
if test_alphanumeric(query) is False:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
|
||||||
|
threads = []
|
||||||
|
|
||||||
|
t1 = QueryThread(query,'people_search',new_limit)
|
||||||
|
#t2 = QueryThread(query,'company_search',new_limit)
|
||||||
|
t3 = QueryThread(query,'lucene_search',new_limit)
|
||||||
|
|
||||||
|
threads.append(t1)
|
||||||
|
#threads.append(t2)
|
||||||
|
threads.append(t3)
|
||||||
|
|
||||||
|
#start all threads
|
||||||
|
[x.start() for x in threads]
|
||||||
|
|
||||||
|
#wait for all of them to finish
|
||||||
|
[x.join() for x in threads]
|
||||||
|
|
||||||
|
#at this point all threads have finished and all queries have been performed
|
||||||
|
|
||||||
|
|
||||||
|
#first, check people names
|
||||||
|
people_first_source = t1.results
|
||||||
|
#people_first_source = []
|
||||||
|
|
||||||
|
results_people += people_first_source
|
||||||
|
|
||||||
|
'''
|
||||||
|
#second, check company names
|
||||||
|
found_exact_match, results_second_source = t2.found_exact_match, t2.results
|
||||||
|
|
||||||
|
#if found exact match then results are people working in that company
|
||||||
|
if(found_exact_match):
|
||||||
|
results_people += results_second_source
|
||||||
|
#else results are list of possible companies
|
||||||
|
else:
|
||||||
|
results_companies = results_second_source
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
#third, component is lucene results
|
||||||
|
results_lucene = t3.results
|
||||||
|
|
||||||
|
#lucene results are people
|
||||||
|
results_people += results_lucene
|
||||||
|
|
||||||
|
'''
|
||||||
|
#dedup all results before sending out
|
||||||
|
from substring_search import dedup_search_results
|
||||||
|
results_people = dedup_search_results(results_people)
|
||||||
|
|
||||||
|
from substring_search import fix_search_order
|
||||||
|
results_people = fix_search_order(query,results_people)
|
||||||
|
'''
|
||||||
|
|
||||||
|
results = {'people':results_people[:new_limit]}
|
||||||
|
|
||||||
|
#mc.set(cache_key,results)
|
||||||
|
|
||||||
|
return jsonify(results)
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def debug(query):
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
#------------------
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
app.run(debug=True, port=5003)
|
||||||
260
blockstack_search/search/substring_search.py
Executable file
260
blockstack_search/search/substring_search.py
Executable file
@@ -0,0 +1,260 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2013 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
'''
|
||||||
|
functions for substring search
|
||||||
|
'''
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from pymongo import MongoClient
|
||||||
|
c = MongoClient()
|
||||||
|
|
||||||
|
from config import DEFAULT_LIMIT
|
||||||
|
|
||||||
|
INPUT_OPTIONS = '--create_cache --search <query>'
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def create_dedup_names_cache():
    '''
    Rebuild the people-name search cache.

    Reads every node from the 'freegraph' mongo DB, extracts lowercased
    "first last" names, dedupes them, and stores the full list in a single
    document of the 'fg_search_cache' DB, which search_people_by_name later
    reads.

    NOTE(review): the cache DB is dropped before the rebuild, so searches
    issued mid-rebuild see an empty cache.
    '''

    fg = c['freegraph']

    #delete any old cache
    c.drop_database('fg_search_cache')

    search_cache = c['fg_search_cache']
    people_cache = search_cache.people_cache

    nodes = fg.nodes

    #------------------------------
    #for creating people cache

    counter = 0
    people_names = []

    for i in nodes.find():

        counter += 1

        # progress indicator while walking a potentially large collection
        if(counter % 1000 == 0):
            print counter

        try:
            name = i['data']['name']['first'].lower() + ' ' + i['data']['name']['last'].lower()
        except:
            # node has no complete data.name.{first,last} sub-document; skip it
            pass
        else:
            people_names.append(name)

    # dedupe via set; original insertion order is not preserved (not needed here)
    dedup_people_names = list(set(people_names))

    insert_people_names = {'dedup_people_names':dedup_people_names}

    #save final dedup results to mongodb (using it as a cache)
    people_cache.save(insert_people_names)

    #print '-' * 5
    #log.debug('Created deduped people_cache: %s from %s', len(dedup_people_names), len(people_names))
    #log.debug('Creating company cache ...')

    #db.posts.ensure_index('full_name')
    #log.debug('DONE! All set for searching now.')
||||||
|
#-------------------------
|
||||||
|
def anyword_substring_search_inner(query_word, target_words):
    '''
    Return query_word if at least one target word starts with it,
    otherwise return False.
    '''
    matched = any(word.startswith(query_word) for word in target_words)
    return query_word if matched else False
||||||
|
#-------------------------
|
||||||
|
def anyword_substring_search(target_words, query_words):
    '''
    Return True if every query word is a prefix of at least one target word,
    otherwise False.
    '''
    for current_query_word in query_words:
        # one unmatched query word already decides the answer, so bail out
        if not any(word.startswith(current_query_word) for word in target_words):
            return False
    return True
||||||
|
#-------------------------
|
||||||
|
def substring_search(query, list_of_strings, limit_results=DEFAULT_LIMIT):
    '''
    Main entry point for searching.

    Returns the strings from list_of_strings whose words prefix-match every
    word of `query`, scanning in order and stopping once limit_results
    matches have been collected.
    '''
    matching = []

    # FIX: split on any whitespace run; query.split(' ') turned doubled or
    # leading/trailing spaces into empty "words" that prefix-match everything,
    # silently weakening the query.
    query_words = query.split()

    # sort by longest word first (highest probability of an early mismatch)
    query_words.sort(key=len, reverse=True)

    counter = 0

    for s in list_of_strings:

        target_words = s.split(' ')

        # the anyword searching function is separate
        if anyword_substring_search(target_words, query_words):
            matching.append(s)

            # limit results
            counter += 1
            if counter == limit_results:
                break

    return matching
||||||
|
#-------------------------
|
||||||
|
def search_people_by_name(query, limit_results=DEFAULT_LIMIT):
    '''
    Case-insensitive people search against the mongo name cache.

    Loads the deduped name list from the 'fg_search_cache' database and runs
    substring_search over it, returning at most limit_results names.
    '''
    normalized_query = query.lower()

    #---------------------
    # using mongodb as a cache: the cache collection holds (at most) one
    # document whose 'dedup_people_names' field is the full deduped name
    # list; the last document wins, and an empty cache yields no names.
    cached_names = []
    for doc in c['fg_search_cache'].people_cache.find():
        cached_names = doc['dedup_people_names']
    #---------------------

    return substring_search(normalized_query, cached_names, limit_results)
||||||
|
#-------------------------
|
||||||
|
def fix_search_order(query, search_results):
    '''
    Re-rank search results by how well each result's 'full_name' matches the
    first word of the query.

    Tier 1: first name-word starts with the query's first word.
    Tier 2: second name-word starts with the query's first word.
    Tier 3: everything else (including malformed entries).
    Original order is preserved within each tier.

    Returns a new list; search_results is not modified.
    '''
    # FIX: dropped the dead second_word/third_word locals (computed but never
    # used) and narrowed the bare excepts to the failures actually expected
    # from a malformed result entry.
    words = query.split(' ')

    # single-word queries use the query itself; multi-word queries use word 0
    first_word = query if len(words) < 2 else words[0]

    tier_one = []    # first name-word matches
    tier_two = []    # second name-word matches
    tier_rest = []   # no match / malformed entries (last pass, unprocessed)

    for result in search_results:
        try:
            name_words = result['full_name'].split(' ')
        except (KeyError, TypeError, AttributeError):
            # missing or non-string 'full_name': keep it, but rank it last
            tier_rest.append(result)
            continue

        if name_words[0].startswith(first_word):
            tier_one.append(result)
        elif len(name_words) > 1 and name_words[1].startswith(first_word):
            tier_two.append(result)
        else:
            tier_rest.append(result)

    return tier_one + tier_two + tier_rest
||||||
|
#-------------------------
|
||||||
|
def dedup_search_results(search_results):
    '''
    Drop duplicate results, keyed on each entry's 'url' field.

    Keeps the first occurrence of every url and preserves the input order.
    Returns a new list; search_results is not modified.
    '''
    # FIX: the old docstring claimed deduplication was based on 'slug', but
    # the code has always keyed on 'url'; the doc now matches the behavior.
    known_links = set()
    deduped_results = []

    for entry in search_results:
        link = entry['url']
        if link not in known_links:
            known_links.add(link)
            deduped_results.append(entry)

    return deduped_results
||||||
|
#-------------------------
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
if(len(sys.argv) < 2):
|
||||||
|
print "Usage error"
|
||||||
|
|
||||||
|
option = sys.argv[1]
|
||||||
|
|
||||||
|
if(option == '--create_cache'):
|
||||||
|
create_dedup_names_cache()
|
||||||
|
elif(option == '--search'):
|
||||||
|
query = sys.argv[2]
|
||||||
|
print search_people_by_name(query,DEFAULT_LIMIT)
|
||||||
|
|
||||||
|
else:
|
||||||
|
print "Usage error"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print e
|
||||||
72
blockstack_search/search/test_client.py
Executable file
72
blockstack_search/search/test_client.py
Executable file
@@ -0,0 +1,72 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
#-----------------------
|
||||||
|
# Copyright 2013 Halfmoon Labs, Inc.
|
||||||
|
# All Rights Reserved
|
||||||
|
#-----------------------
|
||||||
|
|
||||||
|
'''
|
||||||
|
For testing the search API from command line
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
def search_client(query,server):
|
||||||
|
|
||||||
|
|
||||||
|
print '-' * 10
|
||||||
|
print "Searching for: " + query
|
||||||
|
print '-' * 10
|
||||||
|
|
||||||
|
url = 'http://localhost:5000/search/people'
|
||||||
|
|
||||||
|
if(server == 'remote'):
|
||||||
|
url = 'http://54.200.209.148/search/people'
|
||||||
|
|
||||||
|
print url
|
||||||
|
|
||||||
|
data = {'query': query, 'limit_results': 35}
|
||||||
|
|
||||||
|
headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
|
||||||
|
|
||||||
|
r = requests.get(url, params=data, headers=headers)
|
||||||
|
|
||||||
|
print r
|
||||||
|
|
||||||
|
temp = r.json()
|
||||||
|
|
||||||
|
print '-' * 10
|
||||||
|
|
||||||
|
print "People: "
|
||||||
|
|
||||||
|
for i in temp['people']:
|
||||||
|
|
||||||
|
print i
|
||||||
|
#print i['first_name'] + ' ' + i['last_name'] + ' | ' + 'http://www.crunchbase.com/person/' + i['crunchbase_slug']
|
||||||
|
|
||||||
|
if(len(temp['companies']) > 0):
|
||||||
|
|
||||||
|
print '-' * 10
|
||||||
|
print "Companies: "
|
||||||
|
|
||||||
|
for i in temp['companies']:
|
||||||
|
print i
|
||||||
|
|
||||||
|
print '-' * 10
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
if(len(sys.argv) < 2): print "Error more arguments needed"
|
||||||
|
|
||||||
|
query=sys.argv[1]
|
||||||
|
server = 'local'
|
||||||
|
|
||||||
|
try:
|
||||||
|
server = sys.argv[2]
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
search_client(query, server)
|
||||||
Reference in New Issue
Block a user