diff --git a/blockstack_search/README.md b/blockstack_search/README.md
index 3153e1c1b..33885bf48 100644
--- a/blockstack_search/README.md
+++ b/blockstack_search/README.md
@@ -1,4 +1,4 @@
-fgsearch
+nodepath
 ========
 
 Search API for FreeGraph
diff --git a/blockstack_search/crawler/__init__.py b/blockstack_search/crawler/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/blockstack_search/crawler/common.py b/blockstack_search/crawler/common.py
new file mode 100755
index 000000000..54db1a881
--- /dev/null
+++ b/blockstack_search/crawler/common.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+#-----------------------
+# Copyright 2013 Halfmoon Labs, Inc.
+# All Rights Reserved
+#-----------------------
+
+import json
+from json import JSONEncoder
+from bson.objectid import ObjectId
+import logging
+from config import DEBUG
+
+#-------------------------
+def get_logger(log_name=None, log_type='stream'):
+
+    if(DEBUG):
+        log = logging.getLogger(log_name)
+        log.setLevel(logging.DEBUG)
+
+        formatter_stream = logging.Formatter('[%(levelname)s] %(message)s')
+        handler_stream = logging.StreamHandler()
+        handler_stream.setFormatter(formatter_stream)
+
+        formatter_file = logging.Formatter('[%(levelname)s] %(message)s')
+        handler_file = logging.FileHandler('log/debug.log', mode='w')
+        handler_file.setFormatter(formatter_file)
+
+        if(log_type == 'stream'):
+            log.addHandler(handler_stream)
+        elif(log_type == 'file'):
+            log.addHandler(handler_file)
+        else:
+            log = None
+
+    else:
+        #fix: ensure log is defined when DEBUG is off
+        log = None
+
+    return log
+
+#-------------------------
+#common logger
+log = get_logger()
+
+class MongoEncoder(JSONEncoder):
+    def default(self, obj, **kwargs):
+        if isinstance(obj, ObjectId):
+            return str(obj)
+        else:
+            #fix: pass self through to the base class explicitly
+            return JSONEncoder.default(self, obj, **kwargs)
+
+#-------------------------
+def pretty_dump(input):
+
+    return json.dumps(input, cls=MongoEncoder, sort_keys=False, indent=4, separators=(',', ': '))
+
+#-------------------------
+def pretty_print(input):
+    print pretty_dump(input)
+
+#---------------------------------
+def error_reply(msg):
+    reply = {}
+    reply['status'] = -1
+    reply['message'] = "ERROR: " + msg
+    return pretty_dump(reply)
diff --git a/blockstack_search/crawler/config.py b/blockstack_search/crawler/config.py
new file mode 100755
index 000000000..49ac5834d
--- /dev/null
+++ b/blockstack_search/crawler/config.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+#-----------------------
+# Copyright 2014 Halfmoon Labs, Inc.
+# All Rights Reserved
+#-----------------------
+
+PORT = 5001
+DEBUG = True
+FG_API_SLUG = '/api/users'
+SUBDOMAINS = ['freegraph','fg']
+SCANPORTS = ['80','5000','8555']
diff --git a/blockstack_search/crawler/crawler.py b/blockstack_search/crawler/crawler.py
new file mode 100755
index 000000000..5d314a90e
--- /dev/null
+++ b/blockstack_search/crawler/crawler.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+#-----------------------
+# Copyright 2014 Halfmoon Labs, Inc.
+# All Rights Reserved
+#-----------------------
+
+import json
+from flask import Flask, render_template, request
+from common import pretty_dump, error_reply
+import requests
+
+app = Flask(__name__)
+app.config.from_object('config')
+
+from pymongo import MongoClient
+c = MongoClient()
+
+fg = c['freegraph']
+
+#-------------------------
+def get_domain_from_url(url):
+
+    from urlparse import urlparse
+
+    o = urlparse(url)
+
+    domain = o.hostname
+
+    return domain.lower()
+
+#-------------------------
+def check_host_url_inner(url):
+
+    #headers = {'Content-type': 'application/json', 'Accept': 'text/plain', 'Authorization': 'Basic'}
+
+    print "checking: " + url
+
+    try:
+        r = requests.get(url)
+    except:
+        return False, None
+
+    print r.status_code
+
+    if(r.status_code == 200):
+        try:
+            data = r.json()
+        except:
+            return False, None
+
+        if 'users' in data.keys():
+            return True, data
+
+    #fix: non-200 responses previously fell off the end and returned None,
+    #which crashed the tuple unpack in check_host_url
+    return False, None
+
+#-------------------------
+def check_host_url(domain):
+
+    check_urls = []
+    check_servers = []
+
+    check_servers.append(domain)
+
+    for i in app.config['SUBDOMAINS']:
+        check_servers.append(i + '.' + domain)
+
+    for server in check_servers:
+
+        for port in app.config['SCANPORTS']:
+            check_urls.append('http://' + server + ':' + port + app.config['FG_API_SLUG'])
+
+    for url in check_urls:
+        reply, data = check_host_url_inner(url)
+        if(reply):
+            return url, data
+
+    return False, None
+
+#-----------------------------------
+@app.route('/')
+def index():
+
+    return render_template('index.html')
+
+#-----------------------------------
+@app.route('/host', methods=['GET'])
+def get_host():
+
+    try:
+        input_url = request.values['url']
+
+        #check if 'http' or 'https' was entered; if not, prepend 'http'
+        if((input_url.find('http://') == -1) and (input_url.find('https://') == -1)):
+            input_url = 'http://' + input_url
+
+    except:
+        return error_reply("No URL given")
+
+    domain = get_domain_from_url(str(input_url))
+
+    host_url, data = check_host_url(domain)
+    nodes = []
+
+    if(host_url is not False):
+        reply = fg.hosts.find_one({'domain': domain})
+
+        if(reply):
+            fg.hosts.remove(reply)
+
+        host = {}
+        host['domain'] = domain
+        host['host_url'] = host_url
+        host['data'] = data
+        fg.hosts.insert(host)
+
+        nodes = data['users'].keys()
+
+        print nodes
+
+        for username in nodes:
+
+            node = {}
+            node['node_url'] = host_url + '/' + username
+
+            reply = fg.nodes.find_one({'node_url': node['node_url']})
+
+            if(reply):
+                fg.nodes.remove(reply)
+
+            node['data'] = requests.get(node['node_url']).json()
+
+            try:
+                full_name = node['data']['name']['first'].lower() + ' ' + node['data']['name']['last'].lower()
+            except:
+                node['full_name'] = ""
+            else:
+                node['full_name'] = full_name
+
+            fg.nodes.insert(node)
+
+    return render_template('node.html', domain=domain, host_url=host_url, nodes=nodes)
+
+#------------------
+if __name__ == '__main__':
+    app.run(debug=app.config['DEBUG'], port=app.config['PORT'])
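
To sanity-check the crawler by hand, the `/host` route above can be driven with a short script. A minimal sketch, assuming `crawler.py` is running locally on the `PORT` from `config.py` and that the target domain actually exposes a FreeGraph API (both hypothetical here):

```python
# Hypothetical smoke test for the crawler's /host endpoint.
# Assumes crawler.py is running locally on port 5001 (PORT in config.py).
import requests

r = requests.get('http://localhost:5001/host',
                 params={'url': 'halfmoonlabs.com'})
print r.status_code   # 200 once the host has been probed and indexed
```
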
diff --git a/blockstack_search/crawler/discovery.py b/blockstack_search/crawler/discovery.py
new file mode 100755
index 000000000..65048b507
--- /dev/null
+++ b/blockstack_search/crawler/discovery.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+#-----------------------
+# Copyright 2014 Halfmoon Labs, Inc.
+# All Rights Reserved
+#-----------------------
+
+import json
+from flask import Flask, render_template
+from common import pretty_dump, error_reply
+
+app = Flask(__name__)
+
+#-----------------------------------
+@app.route('/')
+def index():
+
+    from datetime import datetime
+    time = datetime.now()
+    return render_template('discovery.html', time=time.strftime('%X'))
+
+#-----------------------------------
+@app.route('/poll/<target>', methods=['GET'])
+def poll_target(target):
+
+    reply = {}
+
+    blocks = '270941'
+
+    if(target == 'blockchain'):
+        reply['status'] = 1
+        reply['message'] = "Refreshed discovery_queue from source 'bitcoin blockchain'. Latest blocks: " + blocks
+
+    elif(target == 'crawlindex'):
+        from datetime import datetime, timedelta
+        diff = timedelta(hours=24)
+
+        last_crawled = datetime.now() - diff
+
+        reply['status'] = 1
+        reply['message'] = "Refreshed discovery_queue from source 'crawl index'. Oldest crawled URL: " + last_crawled.strftime('%Y-%m-%d %X')
+
+    else:
+        reply = "Target '" + target + "' not recognized"
+        return error_reply(reply)
+
+    return pretty_dump(reply)
+
+#-----------------------------------
+@app.errorhandler(500)
+def internal_error(error):
+
+    return error_reply("Something went wrong with the server")
+
+#-----------------------------------
+@app.errorhandler(404)
+def not_found_error(error):
+
+    return error_reply('URL not found on this server')
+
+#------------------
+if __name__ == '__main__':
+    app.run(debug=True)
diff --git a/blockstack_search/crawler/log/debug.log b/blockstack_search/crawler/log/debug.log
new file mode 100644
index 000000000..e69de29bb
diff --git a/blockstack_search/crawler/templates/discovery.html b/blockstack_search/crawler/templates/discovery.html
new file mode 100644
index 000000000..7c7756a4b
--- /dev/null
+++ b/blockstack_search/crawler/templates/discovery.html
@@ -0,0 +1,14 @@
+<html>
+<head>
+
+<meta http-equiv="refresh" content="300">
+
+</head>
+<body>
+
+This page refreshes every 5 minutes.<br>
+
+Time right now is: {{time}}<br>
+
+</body>
+</html>
\ No newline at end of file
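
The `/poll/<target>` route above replies with a `status`/`message` JSON pair built by `pretty_dump`. A minimal sketch of exercising it, assuming `discovery.py` is running on Flask's default port 5000:

```python
# Hypothetical check of discovery.py's /poll/<target> route.
# Assumes discovery.py is running locally on Flask's default port 5000.
import json
import requests

reply = json.loads(requests.get('http://localhost:5000/poll/blockchain').text)
print reply['status']    # 1 on success
print reply['message']   # "Refreshed discovery_queue from source 'bitcoin blockchain'. ..."
```
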
diff --git a/blockstack_search/crawler/templates/index.html b/blockstack_search/crawler/templates/index.html
new file mode 100644
index 000000000..ad56ea607
--- /dev/null
+++ b/blockstack_search/crawler/templates/index.html
@@ -0,0 +1,22 @@
+<html>
+<head>
+</head>
+<body>
+
+FreeGraph crawler is starting up ...<br>
+
+Initializing ...<br>
+
+Current nodes in the index:<br>
+
+http://halfmoonlabs.com<br>
+http://cs.princeton.edu<br>
+
+Current users in the index:<br>
+
+Ryan Shea, Halfmoon Labs<br>
+Muneeb Ali, Halfmoon Labs<br>
+JP Singh, Princeton CS<br>
+
+</body>
+</html>
\ No newline at end of file
diff --git a/blockstack_search/crawler/templates/node.html b/blockstack_search/crawler/templates/node.html
new file mode 100644
index 000000000..18d745e26
--- /dev/null
+++ b/blockstack_search/crawler/templates/node.html
@@ -0,0 +1,13 @@
+<html>
+<head>
+</head>
+<body>
+
+Checking domain: {{domain}}<br>
+
+FreeGraph API found: {{host_url}}<br>
+
+Added users (nodes): {% for node in nodes %}{{node}} {% endfor %}
+
+</body>
+</html>
\ No newline at end of file
diff --git a/blockstack_search/requirements.txt b/blockstack_search/requirements.txt
new file mode 100644
index 000000000..e584966b3
--- /dev/null
+++ b/blockstack_search/requirements.txt
@@ -0,0 +1,13 @@
+Flask==0.10.1
+Jinja2==2.7.2
+MarkupSafe==0.18
+Werkzeug==0.9.4
+itsdangerous==0.23
+pyes==0.90.1
+pylibmc==1.2.3
+pymongo==2.6.3
+pytz==2013.9
+requests==2.2.1
+six==1.5.2
+urllib3==1.7.1
+wsgiref==0.1.2
diff --git a/blockstack_search/search/README.md b/blockstack_search/search/README.md
new file mode 100644
index 000000000..562166544
--- /dev/null
+++ b/blockstack_search/search/README.md
@@ -0,0 +1,100 @@
+# Scope Search
+
+We currently have three search sub-systems to handle search queries:
+
+* Substring search on people names
+* Substring search on company names
+* Search on the raw lucene index
+
+We assume that the user is entering either a *person's name* OR a *company's name* in the search query. The API expects an input of the format:
+
+    {
+        "query": "the search query/term",
+        "limit_results": "numeric limit on the number of results, e.g., 50; this parameter is optional"
+    }
+
+The API returns a JSON object of the format:
+
+    {
+        "companies": [],
+        "people": []
+    }
+
+### Quick Testing
+
+You can test the search API using curl:
+
+> curl http://54.200.33.184/search/api/v1.0/people -G -d "query=peter%20thiel"
+
+OR by using [test_client.py](test_client.py):
+
+> ./test_client.py "peter thiel"
+
+Make sure that the packages listed in requirements.txt are installed before using test_client.py.
+
+### Search API
+
+#### People API
+
+The people API can be accessed via:
+
+> curl http://54.200.33.184/search/api/v1.0/people -G -d "query=peter%20thiel"
+
+This currently returns up to a maximum of 20 results (can be fewer, depending on the query) with the following data:
+
+* 'first_name'
+* 'last_name'
+* 'overview' -- overview of the person
+* 'companies' -- each company has 1) title of person, 2) name of company, and 3) permalink of company
+* 'crunchbase_slug' -- this can be used to get the crunchbase URL as http://www.crunchbase.com/person/ + 'crunchbase_slug'
+* 'twitter_handle' -- twitter username
+* 'linkedin_url' -- linkedin URL
+
+#### Company API
+
+The company API can be accessed via:
+
+> curl http://54.200.33.184/search/api/v1.0/company -G -d "query=bank%20simple"
+
+This currently returns up to a maximum of 20 results (can be fewer, depending on the query) with the following data:
+
+* 'name' -- company name
+* 'homepage_url' -- company website
+* 'email_address' -- email, if given on crunchbase
+* 'email_info' -- has information on url_domain, email_domain and whether they can be verified
+* 'total_money_raised' -- the total $$ raised
+* 'people' -- list of current employees
+* 'board' -- list of board members
+* 'overview' -- overview text from crunchbase
+* 'tag_list' -- combination of tags and categories from crunchbase (crunchbase treats them separately, we don't)
+* 'crunchbase_slug' -- this can be used to get the crunchbase URL as http://www.crunchbase.com/company/ + 'crunchbase_slug'
+* 'offices' -- info on company office(s)
+* 'acquisition' -- if acquired, the year it was acquired in
+
+## Installing on UNIX
+
+### Requirements
+
+All required Python packages are listed in 'requirements.txt'. In addition to those, the search system also requires Elasticsearch.
+
+### Elastic Search
+
+The Elasticsearch library is not in the GitHub repo and resides at:
+
+unix/lib/elastic
+
+The current version we're using is *0.90.2*. Download it from:
+
+> wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-0.90.2.zip
+
+### Converting RAW data to search index
+
+Right now, the steps required for going from raw data to "ready for searching" are:
+
+> python scope/datasets/crunchbase/filter_crunchbase_data.py --filter_people
+> python scope/datasets/crunchbase/filter_crunchbase_data.py --filter_company
+> python scopesearch/substring_search.py --create_cache
+> python scopesearch/create_search_index.py --create_people_index
+> python scopesearch/create_search_index.py --create_company_index
+
+We'll simplify these steps in an upcoming release. We assume that both MongoDB and Elasticsearch are running on the server.
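
Besides curl and `test_client.py`, the people endpoint documented above can be called directly. A minimal sketch, assuming the API host from the README is still reachable:

```python
# Hypothetical client call against the people search API described above.
import requests

url = 'http://54.200.33.184/search/api/v1.0/people'
r = requests.get(url, params={'query': 'peter thiel', 'limit_results': 20})

for person in r.json()['people']:
    print person['first_name'], person['last_name']
```
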
diff --git a/blockstack_search/search/__init__.py b/blockstack_search/search/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/blockstack_search/search/common.py b/blockstack_search/search/common.py
new file mode 100755
index 000000000..0f7f9a149
--- /dev/null
+++ b/blockstack_search/search/common.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+#-----------------------
+# Copyright 2013 Halfmoon Labs, Inc.
+# All Rights Reserved
+#-----------------------
+
+import json
+from json import JSONEncoder
+from bson.objectid import ObjectId
+import logging
+from config import DEBUG
+
+#-------------------------
+def get_logger(log_name=None, log_type='stream'):
+
+    if(DEBUG):
+        log = logging.getLogger(log_name)
+        log.setLevel(logging.DEBUG)
+
+        formatter_stream = logging.Formatter('[%(levelname)s] %(message)s')
+        handler_stream = logging.StreamHandler()
+        handler_stream.setFormatter(formatter_stream)
+
+        log.addHandler(handler_stream)
+
+    else:
+        log = None
+
+    return log
+
+#-------------------------
+#common logger
+log = get_logger()
+
+class MongoEncoder(JSONEncoder):
+    def default(self, obj, **kwargs):
+        if isinstance(obj, ObjectId):
+            return str(obj)
+        else:
+            #fix: pass self through to the base class explicitly
+            return JSONEncoder.default(self, obj, **kwargs)
+
+#-------------------------
+def pretty_dump(input):
+
+    return json.dumps(input, cls=MongoEncoder, sort_keys=False, indent=4, separators=(',', ': '))
+
+#-------------------------
+def pretty_print(input):
+    print pretty_dump(input)
+
+#---------------------------------
+def error_reply(msg):
+    reply = {}
+    reply['status'] = -1
+    reply['message'] = "ERROR: " + msg
+    return pretty_dump(reply)
diff --git a/blockstack_search/search/config.py b/blockstack_search/search/config.py
new file mode 100755
index 000000000..1b0771a5e
--- /dev/null
+++ b/blockstack_search/search/config.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+#-----------------------
+# Copyright 2014 Halfmoon Labs, Inc.
+# All Rights Reserved
+#-----------------------
+
+PORT = 5001
+DEBUG = True
+BULK_INSERT_LIMIT = 1000
+DEFAULT_LIMIT = 50
\ No newline at end of file
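
`MongoEncoder` in `common.py` exists so that Mongo `ObjectId` values survive `json.dumps`; without it, `pretty_dump` would raise a `TypeError` on any document that still carries its `_id`. A minimal sketch:

```python
# Illustrates why common.py needs MongoEncoder: ObjectId is not JSON-serializable.
from bson.objectid import ObjectId
from common import pretty_dump

doc = {'_id': ObjectId(), 'full_name': 'ryan shea'}
print pretty_dump(doc)   # the ObjectId is rendered as its string form
```
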
diff --git a/blockstack_search/search/create_search_index.py b/blockstack_search/search/create_search_index.py
new file mode 100755
index 000000000..ec4cb19f5
--- /dev/null
+++ b/blockstack_search/search/create_search_index.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+#-----------------------
+# Copyright 2014 Halfmoon Labs, Inc.
+# All Rights Reserved
+#-----------------------
+
+'''
+    functions for building the ES/lucene search index and mappings
+'''
+import sys
+import json
+from pyes import *
+conn = ES()
+
+from pymongo import MongoClient
+c = MongoClient()
+
+INPUT_OPTIONS = '--create_index --search'
+
+from config import BULK_INSERT_LIMIT
+from common import log
+
+#-------------------------
+def create_mapping(index_name, index_type):
+
+    '''
+        for creating lucene mapping
+        can add different mappings for different index_types
+    '''
+
+    try:
+        #delete the old mapping, if it exists
+        conn.indices.delete_index(index_name)
+    except:
+        pass
+
+    conn.indices.create_index(index_name)
+
+    mapping = {u'full_name': {'boost': 3.0,
+                              'index': 'analyzed',
+                              'store': 'yes',
+                              'type': u'string',
+                              "term_vector": "with_positions_offsets"},
+
+               u'bio': {'boost': 1.0,
+                        'index': 'analyzed',
+                        'store': 'yes',
+                        'type': u'string',
+                        "term_vector": "with_positions_offsets"},
+
+               u'data': {'boost': 2.0,
+                         'index': 'analyzed',
+                         'store': 'yes',
+                         'type': u'string',
+                         "term_vector": "with_positions_offsets"}, }
+
+    conn.indices.put_mapping(index_type, {'properties': mapping}, [index_name])
+
+#-------------------------
+def create_people_index():
+
+    create_mapping("fg_people_index", "fg_people_type")
+
+    from bson import json_util
+
+    db = c['freegraph']
+    nodes = db.nodes
+
+    counter = 0
+
+    for i in nodes.find():
+
+        data = i['data']
+
+        print i
+
+        conn.index({'full_name': i['data']['name']['full'],
+                    'bio': i['data']['bio'],
+                    'data': json.dumps(i['data'], sort_keys=True, default=json_util.default),
+                    '_boost': 1, },
+                   "fg_people_index",
+                   "fg_people_type",
+                   bulk=True)
+
+        counter += 1
+
+        conn.indices.refresh(["fg_people_index"])
+
+        #write in bulk
+        if(counter % BULK_INSERT_LIMIT == 0):
+            print '-' * 5
+            print counter
+            conn.refresh(["fg_people_index"])
+
+    conn.indices.force_bulk()
+
+#----------------------------------
+def test_query(query, index=['fg_people_index']):
+
+    q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='and')
+    count = conn.count(query=q)
+    count = count.count
+
+    if(count == 0):
+        q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='or')
+
+    results = conn.search(query=q, size=20, indices=index)
+
+    counter = 0
+
+    results_list = []
+
+    for i in results:
+        counter += 1
+        print i['full_name']
+
+        temp = json.loads(i['data'])
+
+        results_list.append(temp)
+
+    #print counter
+
+    #print results_list
+
+#-------------------------
+if __name__ == "__main__":
+
+    try:
+
+        if(len(sys.argv) < 2):
+            print "Usage error"
+            sys.exit(1)
+
+        option = sys.argv[1]
+
+        if(option == '--create_index'):
+            create_people_index()
+        elif(option == '--search'):
+            test_query(query=sys.argv[2])
+        else:
+            print "Usage error"
+
+    except Exception as e:
+        print e
diff --git a/blockstack_search/search/search_api.py b/blockstack_search/search/search_api.py
new file mode 100755
index 000000000..5090f7095
--- /dev/null
+++ b/blockstack_search/search/search_api.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+#-----------------------
+# Copyright 2014 Halfmoon Labs, Inc.
+# All Rights Reserved +#----------------------- + +''' + a simple Flask based API for FreeGraph +''' + +from flask import request, jsonify, Flask + +app = Flask(__name__) + +import json +from bson import json_util + +DEFAULT_LIMIT = 30 + +#----------------------------------- +from pymongo import MongoClient +c = MongoClient() + +import pylibmc +mc = pylibmc.Client(["127.0.0.1:11211"],binary=True, + behaviors={'tcp_nodelay':True, + 'connect_timeout':100, + 'no_block':True}) + +import threading + +#------------------------- +#class for performing multi-threaded search on three search sub-systems +class QueryThread(threading.Thread): + def __init__(self,query,query_type,limit_results): + threading.Thread.__init__(self) + self.query=query + self.query_type=query_type + self.results = [] + self.limit_results = limit_results + self.found_exact_match = False + + def run(self): + if(self.query_type == 'people_search'): + self.results = query_people_database(self.query, self.limit_results) + elif(self.query_type == 'company_search'): + self.found_exact_match, self.results = query_company_database(self.query) + elif(self.query_type == 'lucene_search'): + self.results = query_lucene_index(self.query,'fg_people_index', self.limit_results) + +#------------------------- +def query_people_database(query,limit_results=DEFAULT_LIMIT): + + ''' + returns True, {names of employees} if exact match of company name + else returns False, [list of possible companies] + ''' + + from substring_search import search_people_by_name + + people = search_people_by_name(query, limit_results) + + results = [] + mongo_query = [] + + if people is not None: + + if(len(people) == 0): + return results + else: + db = c['freegraph'] + + #the $in query is much faster but messes up intended results order + reply = db.nodes.find({"full_name":{'$in':people}}) + + #the reply is a cursor and need to load actual results first + for i in reply: + results.append(i['data']) + + + temp = json.dumps(results, default=json_util.default) + return json.loads(temp) + +#----------------------------------- +def query_lucene_index(query,index,limit_results=DEFAULT_LIMIT): + + from pyes import StringQuery, ES + conn = ES() + + q = StringQuery(query, search_fields = ['full_name', 'bio', 'data'], default_operator = 'and') + count = conn.count(query = q) + count = count.count + + #having or gives more results but results quality goes down + if(count == 0): + q = StringQuery(query, search_fields = ['full_name', 'bio', 'data'], default_operator = 'or') + + results = conn.search(query = q, size=20, indices=[index]) + + results_list = [] + + counter = 0 + + for i in results: + + temp = json.loads(i['data']) + + results_list.append(temp) + + counter += 1 + + if(counter == limit_results): + break + + return results_list + +#---------------------------------- +def test_alphanumeric(query): + + ''' + check if query has only alphanumeric characters or not + ''' + + import re + valid = re.match('^(\w+(\s)*\w*)+$', query) is not None + + #return valid + return True + +#----------------------------------- +@app.route('/search/people', methods = ['GET']) +def get_people(): + + query = request.values['query'] + new_limit = DEFAULT_LIMIT + + try: + new_limit = int(request.values['limit_results']) + except: + pass + + ''' + cache_key = str('scopesearch_cache_' + query.lower()) + cache_reply = mc.get(cache_key) + + #if a cache hit, respond straight away + if(cache_reply != None): + return jsonify(cache_reply) + ''' + + results_people = [] + + if test_alphanumeric(query) 
is False: + pass + else: + + threads = [] + + t1 = QueryThread(query,'people_search',new_limit) + #t2 = QueryThread(query,'company_search',new_limit) + t3 = QueryThread(query,'lucene_search',new_limit) + + threads.append(t1) + #threads.append(t2) + threads.append(t3) + + #start all threads + [x.start() for x in threads] + + #wait for all of them to finish + [x.join() for x in threads] + + #at this point all threads have finished and all queries have been performed + + + #first, check people names + people_first_source = t1.results + #people_first_source = [] + + results_people += people_first_source + + ''' + #second, check company names + found_exact_match, results_second_source = t2.found_exact_match, t2.results + + #if found exact match then results are people working in that company + if(found_exact_match): + results_people += results_second_source + #else results are list of possible companies + else: + results_companies = results_second_source + + ''' + + #third, component is lucene results + results_lucene = t3.results + + #lucene results are people + results_people += results_lucene + + ''' + #dedup all results before sending out + from substring_search import dedup_search_results + results_people = dedup_search_results(results_people) + + from substring_search import fix_search_order + results_people = fix_search_order(query,results_people) + ''' + + results = {'people':results_people[:new_limit]} + + #mc.set(cache_key,results) + + return jsonify(results) + +#------------------------- +def debug(query): + + return + +#------------------ +if __name__ == '__main__': + + app.run(debug=True, port=5003) diff --git a/blockstack_search/search/substring_search.py b/blockstack_search/search/substring_search.py new file mode 100755 index 000000000..82f5fb63b --- /dev/null +++ b/blockstack_search/search/substring_search.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python +#----------------------- +# Copyright 2013 Halfmoon Labs, Inc. +# All Rights Reserved +#----------------------- + +''' + functions for substring search +''' +import sys + +from pymongo import MongoClient +c = MongoClient() + +from config import DEFAULT_LIMIT + +INPUT_OPTIONS = '--create_cache --search ' + +#------------------------- +def create_dedup_names_cache(): + + ''' + takes people/company names from crunchbase DB and writes deduped names in a 'cache' + ''' + + fg = c['freegraph'] + + #delete any old cache + c.drop_database('fg_search_cache') + + search_cache = c['fg_search_cache'] + people_cache = search_cache.people_cache + + nodes = fg.nodes + + #------------------------------ + #for creating people cache + + counter = 0 + + people_names = [] + + for i in nodes.find(): + + counter += 1 + + if(counter % 1000 == 0): + print counter + + try: + name = i['data']['name']['first'].lower() + ' ' + i['data']['name']['last'].lower() + except: + pass + else: + people_names.append(name) + + + dedup_people_names = list(set(people_names)) + + insert_people_names = {'dedup_people_names':dedup_people_names} + + #save final dedup results to mongodb (using it as a cache) + people_cache.save(insert_people_names) + + #print '-' * 5 + #log.debug('Created deduped people_cache: %s from %s', len(dedup_people_names), len(people_names)) + #log.debug('Creating company cache ...') + + #db.posts.ensure_index('full_name') + #log.debug('DONE! 
All set for searching now.') + +#------------------------- +def anyword_substring_search_inner(query_word,target_words): + + ''' + return True if ANY target_word matches a query_word + ''' + + for target_word in target_words: + + if(target_word.startswith(query_word)): + return query_word + + return False + +#------------------------- +def anyword_substring_search(target_words,query_words): + + ''' + return True if all query_words match + ''' + + matches_required = len(query_words) + matches_found = 0 + + for query_word in query_words: + + reply = anyword_substring_search_inner(query_word,target_words) + + if reply is not False: + + matches_found += 1 + + else: + #this is imp, otherwise will keep checking when the final answer is already False + return False + + if(matches_found == matches_required): + return True + else: + return False + +#------------------------- +def substring_search(query,list_of_strings,limit_results=DEFAULT_LIMIT): + + ''' + main function to call for searching + ''' + + matching = [] + + query_words = query.split(' ') + #sort by longest word (higest probability of not finding a match) + query_words.sort(key=len, reverse=True) + + counter = 0 + + for s in list_of_strings: + + target_words = s.split(' ') + + #the anyword searching function is separate + if(anyword_substring_search(target_words,query_words)): + matching.append(s) + + #limit results + counter += 1 + if(counter == limit_results): + break + + return matching + +#------------------------- +def search_people_by_name(query,limit_results=DEFAULT_LIMIT): + + query = query.lower() + + #--------------------- + #using mongodb as a cache, load data in people_names + search_cache = c['fg_search_cache'] + + people_names = [] + + for i in search_cache.people_cache.find(): + people_names = i['dedup_people_names'] + #--------------------- + + results = substring_search(query,people_names,limit_results) + + return results + +#------------------------- +def fix_search_order(query, search_results): + + results = search_results + + results_names = [] + old_query = query + query = query.split(' ') + + first_word = '' + second_word = '' + third_word = '' + + if(len(query) < 2): + first_word = old_query + else: + first_word = query[0] + second_word = query[1] + + if(len(query) > 2): + third_word = query[2] + + #save results for multiple passes + results_second = [] + results_third = [] + + #------------------------ + for result in results: + + result_list = result['full_name'].split(' ') + + try: + if(result_list[0].startswith(first_word)): + results_names.append(result) + else: + results_second.append(result) + except: + results_second.append(result) + + #------------------------ + for result in results_second: + + result_list = result['full_name'].split(' ') + + try: + if(result_list[1].startswith(first_word)): + results_names.append(result) + else: + results_third.append(result) + except: + results_third.append(result) + #------------------------ + + #results are either in results_names (filtered) or unprocessed in results_third (last pass) + return results_names + results_third + +#------------------------- +def dedup_search_results(search_results): + ''' + dedup results based on 'slug' + ''' + + known_links = set() + deduped_results = [] + + for i in search_results: + + link = i['url'] + + if link in known_links: + continue + + deduped_results.append(i) + + known_links.add(link) + + return deduped_results + +#------------------------- +if __name__ == "__main__": + + try: + + if(len(sys.argv) < 2): + print "Usage error" + 
+        option = sys.argv[1]
+
+        if(option == '--create_cache'):
+            create_dedup_names_cache()
+        elif(option == '--search'):
+            query = sys.argv[2]
+            print search_people_by_name(query, DEFAULT_LIMIT)
+
+        else:
+            print "Usage error"
+
+    except Exception as e:
+        print e
diff --git a/blockstack_search/search/test_client.py b/blockstack_search/search/test_client.py
new file mode 100755
index 000000000..81d16c2a0
--- /dev/null
+++ b/blockstack_search/search/test_client.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+#-----------------------
+# Copyright 2013 Halfmoon Labs, Inc.
+# All Rights Reserved
+#-----------------------
+
+'''
+    For testing the search API from the command line
+'''
+
+import sys
+import requests
+import json
+
+#-------------------------
+def search_client(query, server):
+
+    print '-' * 10
+    print "Searching for: " + query
+    print '-' * 10
+
+    url = 'http://localhost:5000/search/people'
+
+    if(server == 'remote'):
+        url = 'http://54.200.209.148/search/people'
+
+    print url
+
+    data = {'query': query, 'limit_results': 35}
+
+    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
+
+    r = requests.get(url, params=data, headers=headers)
+
+    print r
+
+    temp = r.json()
+
+    print '-' * 10
+
+    print "People: "
+
+    for i in temp['people']:
+
+        print i
+        #print i['first_name'] + ' ' + i['last_name'] + ' | ' + 'http://www.crunchbase.com/person/' + i['crunchbase_slug']
+
+    #the people endpoint currently returns only a 'people' list, so guard the lookup
+    if(len(temp.get('companies', [])) > 0):
+
+        print '-' * 10
+        print "Companies: "
+
+        for i in temp['companies']:
+            print i
+
+    print '-' * 10
+
+#-------------------------
+if __name__ == "__main__":
+
+    if(len(sys.argv) < 2):
+        print "Error: more arguments needed"
+        sys.exit(1)
+
+    query = sys.argv[1]
+    server = 'local'
+
+    try:
+        server = sys.argv[2]
+    except:
+        pass
+
+    search_client(query, server)
\ No newline at end of file
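
For reference, the prefix-matching rule implemented by `anyword_substring_search` in `substring_search.py` above: a candidate matches when every query word is a prefix of some word in the target string. A minimal sketch with hypothetical values, run from the `search/` directory:

```python
# Each query word must prefix-match at least one target word.
from substring_search import anyword_substring_search

print anyword_substring_search(['peter', 'thiel'], ['pet', 'th'])   # True
print anyword_substring_search(['peter', 'thiel'], ['pet', 'xy'])  # False: no target word starts with 'xy'
```
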