Added support for substring search on OneName users.

Currently, the index is created from the user DB of OneName.io (blockchain support can be added later).
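
The index build is exposed through the new --create_index option of substring_search (replacing the old --create_cache option). A minimal sketch, assuming a local MongoDB whose onename_user_db database has already been populated from OneName.io:

    ./substring_search --create_index    # builds the search_db profiles index and the deduped name cache
    ./substring_search --search muneeb   # sanity check: prints matching names, then the matching profiles

(The query string "muneeb" is only a placeholder.)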

The API returns OneName JSON profiles as results.
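
For example, a request against the new /search route might look like the sketch below, assuming the server runs on the DEFAULT_HOST/DEFAULT_PORT values from config.py (the query string is again a placeholder):

    import requests  # already pinned in requirements.txt (requests==2.2.1)

    # 'query' is the only recognized parameter; omitting it returns an error reply
    resp = requests.get('http://127.0.0.1:5000/search', params={'query': 'muneeb'})

    # matching OneName profiles come back under a top-level 'results' key,
    # each entry carrying 'username' and 'profile' fields
    print resp.json()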

small update
Muneeb Ali
2014-06-14 22:34:16 -07:00
parent 58f4736d27
commit a1b1918f3f
10 changed files with 141 additions and 86 deletions


@@ -1,4 +1,4 @@
-fgsearch
+onename-search
 ========
-Search API for FreeGraph
+Search API for OneName


@@ -10,4 +10,3 @@ pytz==2014.2
 requests==2.2.1
 six==1.6.1
 urllib3==1.8
-wsgiref==0.1.2

blockstack_search/search/common.py: Executable file → Normal file (0 lines changed)

blockstack_search/search/config.py: Executable file → Normal file (7 lines changed)

@@ -4,7 +4,10 @@
 # All Rights Reserved
 #-----------------------
-PORT = 5001
-DEBUG = True
+DEBUG = True
+DEFAULT_PORT = 5000
+DEFAULT_HOST = '127.0.0.1'
+BULK_INSERT_LIMIT = 1000
+DEFAULT_LIMIT = 50


@@ -14,7 +14,7 @@ from search_api import get_people
 from flask import make_response,Response
 import json
 from bson import json_util
-from helpers import *
+from rate_limit import *
 app = Flask(__name__)
@@ -166,6 +166,7 @@ def not_found(error):
     Returns a jsonified 500 error message instead of a HTTP 404 error.
     '''
     return make_response(jsonify({ 'error': '500 something wrong' }), 500)
+#----------------------------------------------
 if __name__ == '__main__':
     app.run(debug=True, port=5003)


@@ -5,19 +5,19 @@
 #-----------------------
 '''
-a simple Flask based API for OneName
+OneName Search
 '''
-from flask import request, jsonify, Flask
+from flask import request, jsonify, Flask, make_response
 app = Flask(__name__)
+from config import DEFAULT_HOST, DEFAULT_PORT, DEBUG
 import json
 from bson import json_util
-DEFAULT_LIMIT = 30
 #-----------------------------------
 from pymongo import MongoClient
 c = MongoClient()
 import sys
+from config import DEFAULT_LIMIT
 #import pylibmc
 """mc = pylibmc.Client(["127.0.0.1:11211"],binary=True,
@@ -39,15 +39,21 @@ class QueryThread(threading.Thread):
         self.found_exact_match = False
     def run(self):
-        #if(self.query_type == 'people_search'):
-        #self.results = query_people_database(self.query, self.limit_results)
+        if(self.query_type == 'people_search'):
+            self.results = query_people_database(self.query, self.limit_results)
         #elif(self.query_type == 'company_search'):
         #self.found_exact_match, self.results = query_company_database(self.query)
-        if(self.query_type == 'lucene_search'):
-            self.results = query_lucene_index(self.query,'onename_people_index', self.limit_results)
+        #if(self.query_type == 'lucene_search'):
+        #    self.results = query_lucene_index(self.query,'onename_people_index', self.limit_results)
 #---------------------------------
+def error_reply(msg, code = -1):
+    reply = {}
+    reply['status'] = code
+    reply['message'] = "ERROR: " + msg
+    return jsonify(reply)
+#-------------------------
 """
 def query_people_database(query,limit_results=DEFAULT_LIMIT):
     '''
@@ -55,29 +61,10 @@ def query_people_database(query,limit_results=DEFAULT_LIMIT):
     else returns False, [list of possible companies]
     '''
-    from substring_search import search_people_by_name
+    from substring_search import search_people_by_name, fetch_profiles_from_names
-    people = search_people_by_name(query, limit_results)
-    results = []
-    mongo_query = []
-    if people is not None:
-        if(len(people) == 0):
-            return results
-        else:
-            db = c['onename_search']
-            #the $in query is much faster but messes up intended results order
-            reply = db.nodes.find({"details":{'$in':people}})
-            #the reply is a cursor and need to load actual results first
-            for i in reply:
-                results.append(i['data'])
-            temp = json.dumps(results, default=json_util.default)
-            return json.loads(temp)
+    name_search_results = search_people_by_name(query, limit_results)
+    return fetch_profiles_from_names(name_search_results)
+"""
 #-----------------------------------
@@ -109,6 +96,7 @@ def query_lucene_index(query,index,limit_results=DEFAULT_LIMIT):
             break
     return results_list
+"""
 #----------------------------------
 def test_alphanumeric(query):
@@ -124,7 +112,13 @@ def test_alphanumeric(query):
     return True
 #-----------------------------------
-def get_people(query):
+@app.route('/search')
+def get_people():
+    query = request.args.get('query')
+    if query == None:
+        return error_reply("No query given")
     new_limit = DEFAULT_LIMIT
@@ -141,7 +135,7 @@ def get_people(query):
     threads = []
-    t3 = QueryThread(query,'lucene_search',new_limit)
+    t3 = QueryThread(query,'people_search',new_limit)
     threads.append(t3)
@@ -158,16 +152,33 @@ def get_people(query):
     results_people += results_lucene
-    results = {'people':results_people[:new_limit]}
+    results = {}
+    results['results'] = results_people[:new_limit]
     #print results
     #mc.set(cache_key,results)
     return jsonify(results)
 #-------------------------
-def debug(query):
-    return
+#-----------------------------------
+@app.route('/')
+def index():
+    return 'Welcome to the search API server of <a href="http://halfmoonlabs.com">Halfmoon Labs</a>.'
 #-----------------------------------
 @app.errorhandler(500)
 def internal_error(error):
 #------------------
     reply = []
     return json.dumps(reply)
+#-----------------------------------
+@app.errorhandler(404)
+def not_found(error):
+    return make_response(jsonify( { 'error': 'Not found' } ), 404)
 #-----------------------------------
 if __name__ == '__main__':
+    app.run(host=DEFAULT_HOST, port=DEFAULT_PORT,debug=DEBUG)


@@ -6,68 +6,86 @@
 '''
 functions for substring search
 usage: './substring_search --create_cache --search <query>'
 '''
 import sys
 import json
 from common import log
 from pymongo import MongoClient
-c = MongoClient()
+client = MongoClient()
+db = client['onename_user_db']
+local_users = db.users
 from config import DEFAULT_LIMIT
 INPUT_OPTIONS = '--create_cache --search <query>'
 #-------------------------
-def create_dedup_names_cache():
+def create_search_index():
     '''
-    takes people/company names from crunchbase DB and writes deduped names in a 'cache'
+    takes people names from blockchain and writes deduped names in a 'cache'
     '''
-    fg = c['freegraph']
-    #delete any old cache
-    c.drop_database('fg_search_cache')
-    search_cache = c['fg_search_cache']
-    people_cache = search_cache.people_cache
-    nodes = fg.nodes
+    #delete any old cache/index
+    client.drop_database('search_db')
+    client.drop_database('search_cache')
+    search_db = client['search_db']
+    search_profiles = search_db.profiles
+    search_cache = client['search_cache']
+    people_cache = search_cache.people
 #------------------------------
-    #for creating people cache
+    # create people name cache
     counter = 0
     people_names = []
-    for i in nodes.find():
+    for user in local_users.find():
+        search_profile = {}
         counter += 1
         if(counter % 1000 == 0):
             print counter
-        try:
-            name = i['data']['name']['first'].lower() + ' ' + i['data']['name']['last'].lower()
-        except:
-            pass
-        else:
+        profile = json.loads(user['profile'])
+        if 'name' in profile:
+            name = profile['name']
+            try:
+                name = name['formatted'].lower()
+            except:
+                name = name.lower()
             people_names.append(name)
+            # create index for looking up profiles by people name
+            search_profile['name'] = name
+            search_profile['profile'] = profile
+            search_profile['username'] = user['username']
+            search_profiles.save(search_profile)
 #------------------------------
-    dedup_people_names = list(set(people_names))
-    insert_people_names = {'dedup_people_names':dedup_people_names}
+    #dedup names
+    people_names = list(set(people_names))
+    people_names = {'people':people_names}
     #save final dedup results to mongodb (using it as a cache)
-    people_cache.save(insert_people_names)
+    people_cache.save(people_names)
-    #print '-' * 5
-    #log.debug('Created deduped people_cache: %s from %s', len(dedup_people_names), len(people_names))
-    #log.debug('Creating company cache ...')
-    #db.posts.ensure_index('full_name')
-    #log.debug('DONE! All set for searching now.')
+    search_cache.people.ensure_index('people')
+    search_db.profiles.ensure_index('name')
+    log.debug('Created people_cache and search_profile index')
 #-------------------------
 def anyword_substring_search_inner(query_word,target_words):
@@ -147,20 +165,41 @@ def search_people_by_name(query,limit_results=DEFAULT_LIMIT):
     #---------------------
     #using mongodb as a cache, load data in people_names
-    search_cache = c['fg_search_cache']
+    search_cache = client['search_cache']
     people_names = []
-    for i in search_cache.people_cache.find():
-        people_names = i['dedup_people_names']
+    for i in search_cache.people.find():
+        people_names = i['people']
     #---------------------
     results = substring_search(query,people_names,limit_results)
-    return results
+    return order_search_results(query,results)
 #-------------------------
-def fix_search_order(query, search_results):
+def fetch_profiles_from_names(name_search_results):
+    search_db = client['search_db']
+    search_profiles = search_db.profiles
+    results = []
+    for name in name_search_results:
+        result = search_profiles.find_one({"name":name})
+        del result['name']
+        del result['_id']
+        results.append(result)
+    return results
+#-------------------------
+def order_search_results(query, search_results):
     '''
     order of results should be a) query in first name, b) query in last name
     '''
     results = search_results
@@ -188,7 +227,7 @@ def fix_search_order(query, search_results):
     #------------------------
     for result in results:
-        result_list = result['full_name'].split(' ')
+        result_list = result.split(' ')
         try:
             if(result_list[0].startswith(first_word)):
@@ -201,7 +240,7 @@ def fix_search_order(query, search_results):
     #------------------------
     for result in results_second:
-        result_list = result['full_name'].split(' ')
+        result_list = result.split(' ')
         try:
             if(result_list[1].startswith(first_word)):
@@ -247,12 +286,14 @@ if __name__ == "__main__":
     option = sys.argv[1]
-    if(option == '--create_cache'):
-        create_dedup_names_cache()
+    if(option == '--create_index'):
+        create_search_index()
     elif(option == '--search'):
         query = sys.argv[2]
-        print search_people_by_name(query,DEFAULT_LIMIT)
+        name_search_results = search_people_by_name(query,DEFAULT_LIMIT)
+        print name_search_results
+        print '-' * 5
+        print fetch_profiles_from_names(name_search_results)
     else:
         print "Usage error"