Mirror of https://github.com/alexgo-io/stacks-puppet-node.git (synced 2026-01-12 22:43:42 +08:00)
first commit
@@ -1,4 +1,4 @@
-fgsearch
+nodepath
 ========

 Search API for FreeGraph
0  blockstack_search/crawler/__init__.py  Normal file
61  blockstack_search/crawler/common.py  Executable file
@@ -0,0 +1,61 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2013 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

import json
from json import JSONEncoder
from bson.objectid import ObjectId
import logging
from config import DEBUG

#-------------------------
def get_logger(log_name=None, log_type='stream'):

    if(DEBUG):
        log = logging.getLogger(log_name)
        log.setLevel(logging.DEBUG)

        formatter_stream = logging.Formatter('[%(levelname)s] %(message)s')
        handler_stream = logging.StreamHandler()
        handler_stream.setFormatter(formatter_stream)

        formatter_file = logging.Formatter('[%(levelname)s] %(message)s')
        handler_file = logging.FileHandler('log/debug.log', mode='w')
        handler_file.setFormatter(formatter_file)

        if(log_type == 'stream'):
            log.addHandler(handler_stream)
        elif(log_type == 'file'):
            log.addHandler(handler_file)
    else:
        #logging is disabled when DEBUG is off
        log = None

    return log

#-------------------------
#common logger
log = get_logger()

class MongoEncoder(JSONEncoder):
    def default(self, obj, **kwargs):
        if isinstance(obj, ObjectId):
            return str(obj)
        else:
            #pass self explicitly; otherwise JSONEncoder.default() receives obj as 'self'
            return JSONEncoder.default(self, obj, **kwargs)

#-------------------------
def pretty_dump(input):

    return json.dumps(input, cls=MongoEncoder, sort_keys=False, indent=4, separators=(',', ': '))

#-------------------------
def pretty_print(input):
    print pretty_dump(input)

#---------------------------------
def error_reply(msg):
    reply = {}
    reply['status'] = -1
    reply['message'] = "ERROR: " + msg
    return pretty_dump(reply)
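How these helpers fit together, as a minimal sketch (not part of the commit; it assumes config.py is importable and pymongo/bson is installed):

    #exercise common.py from a Python shell
    from bson.objectid import ObjectId
    from common import get_logger, pretty_dump, error_reply

    log = get_logger('crawler')                      #log_type defaults to 'stream'
    log.debug('starting up')

    #ObjectId is not JSON-serializable by default; MongoEncoder stringifies it
    print pretty_dump({'_id': ObjectId(), 'status': 1})
    print error_reply('no URL given')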
11  blockstack_search/crawler/config.py  Executable file
@@ -0,0 +1,11 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

PORT = 5001
DEBUG = True
FG_API_SLUG = '/api/users'
SUBDOMAINS = ['freegraph','fg']
SCANPORTS = ['80','5000','8555']
145  blockstack_search/crawler/crawler.py  Executable file
@@ -0,0 +1,145 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

import json
from flask import Flask, render_template, request
from common import pretty_dump, error_reply
import requests

app = Flask(__name__)
app.config.from_object('config')

from pymongo import MongoClient
c = MongoClient()

fg = c['freegraph']

#-------------------------
def get_domain_from_url(url):

    from urlparse import urlparse

    o = urlparse(url)

    domain = o.hostname

    return domain.lower()

#-------------------------
def check_host_url_inner(url):

    #headers = {'Content-type': 'application/json', 'Accept': 'text/plain', 'Authorization': 'Basic'}

    print "checking: " + url

    try:
        r = requests.get(url)
    except:
        return False, None

    print r.status_code

    if(r.status_code == 200):
        try:
            data = r.json()
        except:
            return False, None

        if 'users' in data.keys():
            return True, data

    #fall through on non-200 responses or a missing 'users' key,
    #so callers can always unpack a (found, data) tuple
    return False, None

#-------------------------
def check_host_url(domain):

    check_urls = []
    check_servers = []

    check_servers.append(domain)

    for i in app.config['SUBDOMAINS']:
        check_servers.append(i + '.' + domain)

    for server in check_servers:

        for port in app.config['SCANPORTS']:
            check_urls.append('http://' + server + ':' + port + app.config['FG_API_SLUG'])

    for url in check_urls:
        reply, data = check_host_url_inner(url)
        if(reply):
            return url, data

    return False, None

#-----------------------------------
@app.route('/')
def index():

    return render_template('index.html')

#-----------------------------------
@app.route('/host', methods=['GET'])
def get_host():

    try:
        input_url = request.values['url']

        #check if 'http' or 'https' was entered, if not then append 'http'
        if((input_url.find('http://') == -1) and (input_url.find('https://') == -1)):
            input_url = 'http://' + input_url

    except:
        return error_reply("No URL given")

    domain = get_domain_from_url(str(input_url))

    host_url, data = check_host_url(domain)
    nodes = []

    if(host_url is not False):
        reply = fg.hosts.find_one({'domain': domain})

        if(reply):
            fg.hosts.remove(reply)

        host = {}
        host['domain'] = domain
        host['host_url'] = host_url
        host['data'] = data
        fg.hosts.insert(host)

        nodes = data['users'].keys()

        print nodes

        for username in nodes:

            node = {}
            node['node_url'] = host_url + '/' + username

            reply = fg.nodes.find_one({'node_url': node['node_url']})

            if(reply):
                fg.nodes.remove(reply)

            node['data'] = requests.get(node['node_url']).json()

            try:
                full_name = node['data']['name']['first'].lower() + ' ' + node['data']['name']['last'].lower()
            except:
                node['full_name'] = ""
            else:
                node['full_name'] = full_name

            fg.nodes.insert(node)

    return render_template('node.html', domain=domain, host_url=host_url, nodes=nodes)

#------------------
if __name__ == '__main__':
    app.run(debug=app.config['DEBUG'], port=app.config['PORT'])
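For a given domain, check_host_url probes every server/port combination built from the config values. A small standalone sketch of the grid it generates (values copied from crawler/config.py above; 'example.com' is a placeholder):

    SUBDOMAINS = ['freegraph', 'fg']
    SCANPORTS = ['80', '5000', '8555']
    FG_API_SLUG = '/api/users'

    servers = ['example.com'] + [s + '.example.com' for s in SUBDOMAINS]
    urls = ['http://' + server + ':' + port + FG_API_SLUG
            for server in servers
            for port in SCANPORTS]

    #9 candidates, e.g. http://example.com:80/api/users and http://fg.example.com:8555/api/users
    print urls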
62  blockstack_search/crawler/discovery.py  Executable file
@@ -0,0 +1,62 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

import json
from flask import Flask, render_template
from common import pretty_dump, error_reply

app = Flask(__name__)

#-----------------------------------
@app.route('/')
def index():

    from datetime import datetime
    time = datetime.now()
    return render_template('discovery.html', time=time.strftime('%X'))

#-----------------------------------
@app.route('/poll/<string:target>', methods=['GET'])
def poll_target(target):

    reply = {}

    blocks = '270941'

    if(target == 'blockchain'):
        reply['status'] = 1
        reply['message'] = "Refreshed discovery_queue from source 'bitcoin blockchain'. Latest blocks: " + blocks

    elif(target == 'crawlindex'):
        from datetime import datetime, timedelta
        diff = timedelta(hours=24)

        last_crawled = datetime.now() - diff

        reply['status'] = 1
        reply['message'] = "Refreshed discovery_queue from source 'crawl index'. Oldest crawled URL: " + last_crawled.strftime('%Y-%m-%d %X')

    else:
        reply = "Target '" + target + "' not recognized"
        return error_reply(reply)

    return pretty_dump(reply)

#-----------------------------------
@app.errorhandler(500)
def internal_error(error):

    return error_reply("Something went wrong with the server")

#-----------------------------------
@app.errorhandler(404)
def page_not_found(error):

    return error_reply('URL not found on this server')

#------------------
if __name__ == '__main__':
    app.run(debug=True)
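With the discovery app running (Flask's default port 5000, since app.run() is called without a port), the poll endpoint can be exercised directly:

> curl http://localhost:5000/poll/blockchain
> curl http://localhost:5000/poll/crawlindex

Any other target returns the error_reply JSON with status -1.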
0  blockstack_search/crawler/log/debug.log  Normal file
27  blockstack_search/crawler/templates/discovery.html  Normal file
@@ -0,0 +1,27 @@
<html>
<head>

<script type="text/javascript">

//Because the page will automatically refresh, we should mention it on the webpage

function reFresh() {
    location.reload(true)
}

// Set the number below to the amount of delay, in milliseconds,
// you want between page reloads: 1 minute = 60000 milliseconds.
window.setInterval("reFresh()", 300000);

</script>

</head>

<body>
This page refreshes every 5 minutes.<br><br>

Time right now is: {{time}}<br><br>

</body>

</html>
22  blockstack_search/crawler/templates/index.html  Normal file
@@ -0,0 +1,22 @@
<html>
<head>
</head>

<body>
FreeGraph crawler is starting up ... <br><br>
Initializing ... <br><br>

Current nodes in the index: <br><br>

http://halfmoonlabs.com <br>
http://cs.princeton.edu <br><br>

Current users in the index: <br><br>

Ryan Shea, Halfmoon Labs<br>
Muneeb Ali, Halfmoon Labs<br>
JP Singh, Princeton CS<br>

</body>

</html>
13  blockstack_search/crawler/templates/node.html  Normal file
@@ -0,0 +1,13 @@
<html>
<head>
</head>

<body>
Checking domain: {{domain}} <br><br>
FreeGraph API found: {{host_url}} <br><br>

Added users (nodes): {% for node in nodes %}{{node}} {% endfor %}

</body>

</html>
13  blockstack_search/requirements.txt  Normal file
@@ -0,0 +1,13 @@
Flask==0.10.1
Jinja2==2.7.2
MarkupSafe==0.18
Werkzeug==0.9.4
itsdangerous==0.23
pyes==0.90.1
pylibmc==1.2.3
pymongo==2.6.3
pytz==2013.9
requests==2.2.1
six==1.5.2
urllib3==1.7.1
wsgiref==0.1.2
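The pinned packages install in one step with standard pip (a virtualenv is the usual choice but not required):

> pip install -r blockstack_search/requirements.txt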
100  blockstack_search/search/README.md  Normal file
@@ -0,0 +1,100 @@
# Scope Search

We currently have three search sub-systems to handle search queries:

* Substring search on people names
* Substring search on company names
* Search on the raw Lucene index

We assume that the user is entering either a *person's name* OR a *company's name* in the search query. The API expects input of the format:

    {
        "query": "the search query/term",
        "limit_results": "numeric limit on the number of results, e.g., 50; this parameter is optional"
    }

The API returns a JSON object of the format:

    {
        "companies": [],
        "people": []
    }

### Quick Testing

You can test the search API using curl:

> curl http://54.200.33.184/search/api/v1.0/people -G -d "query=peter%20thiel"

OR by using [test_client.py](test_client.py):

> ./test_client.py "peter thiel"

Make sure that the packages listed in requirements.txt are installed before using test_client.py.

### Search API

#### People API

The people API can be accessed via:

> curl http://54.200.33.184/search/api/v1.0/people -G -d "query=peter%20thiel"

This currently returns up to a maximum of 20 results (can be fewer, depending on the query) with the following data:

* 'first_name'
* 'last_name'
* 'overview' -- overview of the person
* 'companies' -- each company has 1) the person's title, 2) the name of the company, and 3) the permalink of the company
* 'crunchbase_slug' -- can be used to build the Crunchbase URL as http://www.crunchbase.com/person/ + 'crunchbase_slug'
* 'twitter_handle' -- Twitter username
* 'linkedin_url' -- LinkedIn URL
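For illustration only, a hypothetical response envelope for the query above; the values are invented, and the field set follows the list above:

    {
        "companies": [],
        "people": [
            {
                "first_name": "peter",
                "last_name": "thiel",
                "overview": "...",
                "companies": [{"title": "...", "name": "...", "permalink": "..."}],
                "crunchbase_slug": "peter-thiel",
                "twitter_handle": "...",
                "linkedin_url": "..."
            }
        ]
    }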
#### Company API

The company API can be accessed via:

> curl http://54.200.33.184/search/api/v1.0/company -G -d "query=bank%20simple"

This currently returns up to a maximum of 20 results (can be fewer, depending on the query) with the following data:

* 'name' -- company name
* 'homepage_url' -- company website
* 'email_address' -- email, if given on Crunchbase
* 'email_info' -- information on url_domain and email_domain, and whether they can be verified
* 'total_money_raised' -- the total $$ raised
* 'people' -- list of current employees
* 'board' -- list of board members
* 'overview' -- overview text from Crunchbase
* 'tag_list' -- combination of tags and categories from Crunchbase (Crunchbase treats them separately; we don't)
* 'crunchbase_slug' -- can be used to build the Crunchbase URL as http://www.crunchbase.com/company/ + 'crunchbase_slug'
* 'offices' -- info on company office(s)
* 'acquisition' -- if acquired, the year it was acquired in

## Installing on UNIX

### Requirements

All required Python packages are listed in 'requirements.txt'. In addition to those, Elasticsearch is also required.

### Elasticsearch

The Elasticsearch library is not in the GitHub repo and resides at

    unix/lib/elastic

The current version we're using is *0.90.2*. Download it from:

> wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-0.90.2.zip

### Converting raw data to a search index

Right now, the steps required for going from raw data to "ready for searching" are:

> python scope/datasets/crunchbase/filter_crunchbase_data.py --filter_people
> python scope/datasets/crunchbase/filter_crunchbase_data.py --filter_company
> python scopesearch/substring_search.py --create_cache
> python scopesearch/create_search_index.py --create_people_index
> python scopesearch/create_search_index.py --create_company_index

We'll simplify these steps in an upcoming release. We assume that both MongoDB and Elasticsearch are running on the server.
0  blockstack_search/search/__init__.py  Normal file
55  blockstack_search/search/common.py  Executable file
@@ -0,0 +1,55 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2013 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

import json
from json import JSONEncoder
from bson.objectid import ObjectId
import logging
from config import DEBUG

#-------------------------
def get_logger(log_name=None, log_type='stream'):

    if(DEBUG):
        log = logging.getLogger(log_name)
        log.setLevel(logging.DEBUG)

        formatter_stream = logging.Formatter('[%(levelname)s] %(message)s')
        handler_stream = logging.StreamHandler()
        handler_stream.setFormatter(formatter_stream)

        log.addHandler(handler_stream)

    else:
        log = None

    return log

#-------------------------
#common logger
log = get_logger()

class MongoEncoder(JSONEncoder):
    def default(self, obj, **kwargs):
        if isinstance(obj, ObjectId):
            return str(obj)
        else:
            #pass self explicitly; otherwise JSONEncoder.default() receives obj as 'self'
            return JSONEncoder.default(self, obj, **kwargs)

#-------------------------
def pretty_dump(input):

    return json.dumps(input, cls=MongoEncoder, sort_keys=False, indent=4, separators=(',', ': '))

#-------------------------
def pretty_print(input):
    print pretty_dump(input)

#---------------------------------
def error_reply(msg):
    reply = {}
    reply['status'] = -1
    reply['message'] = "ERROR: " + msg
    return pretty_dump(reply)
10  blockstack_search/search/config.py  Executable file
@@ -0,0 +1,10 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

PORT = 5001
DEBUG = True
BULK_INSERT_LIMIT = 1000
DEFAULT_LIMIT = 50
146  blockstack_search/search/create_search_index.py  Executable file
@@ -0,0 +1,146 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

'''
functions for building the ES/Lucene search index and mappings
'''
import sys
import json    #used by test_query() below
from pyes import *
conn = ES()

from pymongo import MongoClient
c = MongoClient()

INPUT_OPTIONS = '--create_index --search'

from config import BULK_INSERT_LIMIT
from common import log

#-------------------------
def create_mapping(index_name, index_type):

    '''
    for creating the Lucene mapping;
    can add different mappings for different index_types
    '''

    try:
        #delete the old mapping, if it exists
        conn.indices.delete_index(index_name)
    except:
        pass

    conn.indices.create_index(index_name)

    mapping = {u'full_name': {'boost': 3.0,
                              'index': 'analyzed',
                              'store': 'yes',
                              'type': u'string',
                              "term_vector": "with_positions_offsets"},

               u'bio': {'boost': 1.0,
                        'index': 'analyzed',
                        'store': 'yes',
                        'type': u'string',
                        "term_vector": "with_positions_offsets"},

               u'data': {'boost': 2.0,
                         'index': 'analyzed',
                         'store': 'yes',
                         'type': u'string',
                         "term_vector": "with_positions_offsets"}}

    conn.indices.put_mapping(index_type, {'properties': mapping}, [index_name])

#-------------------------
def create_people_index():

    create_mapping("fg_people_index", "fg_people_type")

    from pymongo import MongoClient
    from bson import json_util
    import json

    c = MongoClient()

    db = c['freegraph']
    nodes = db.nodes

    counter = 0

    for i in nodes.find():

        data = i['data']

        print i

        conn.index({'full_name': i['data']['name']['full'],
                    'bio': i['data']['bio'],
                    'data': json.dumps(i['data'], sort_keys=True, default=json_util.default),
                    '_boost': 1},
                   "fg_people_index",
                   "fg_people_type",
                   bulk=True)

        counter += 1

        conn.indices.refresh(["fg_people_index"])

        #write in bulk
        if(counter % BULK_INSERT_LIMIT == 0):
            print '-' * 5
            print counter
            conn.refresh(["fg_people_index"])

    conn.indices.force_bulk()

#----------------------------------
def test_query(query, index=['fg_people_index']):

    q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='and')
    count = conn.count(query=q)
    count = count.count

    if(count == 0):
        q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='or')

    results = conn.search(query=q, size=20, indices=index)

    counter = 0

    results_list = []

    for i in results:
        counter += 1
        print i['full_name']

        temp = json.loads(i['data'])

        results_list.append(temp)

    #print counter

    #print results_list

#-------------------------
if __name__ == "__main__":

    try:

        if(len(sys.argv) < 2):
            print "Usage error"

        option = sys.argv[1]

        if(option == '--create_index'):
            create_people_index()
        elif(option == '--search'):
            test_query(query=sys.argv[2])
        else:
            print "Usage error"

    except Exception as e:
        print e
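Matching the option handling in the __main__ block, the script is driven from the command line:

> python create_search_index.py --create_index
> python create_search_index.py --search "peter thiel"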
224  blockstack_search/search/search_api.py  Executable file
@@ -0,0 +1,224 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2014 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

'''
a simple Flask based API for FreeGraph
'''

from flask import request, jsonify, Flask

app = Flask(__name__)

import json
from bson import json_util

DEFAULT_LIMIT = 30

#-----------------------------------
from pymongo import MongoClient
c = MongoClient()

import pylibmc
mc = pylibmc.Client(["127.0.0.1:11211"], binary=True,
                    behaviors={'tcp_nodelay': True,
                               'connect_timeout': 100,
                               'no_block': True})

import threading

#-------------------------
#class for performing multi-threaded search on three search sub-systems
class QueryThread(threading.Thread):
    def __init__(self, query, query_type, limit_results):
        threading.Thread.__init__(self)
        self.query = query
        self.query_type = query_type
        self.results = []
        self.limit_results = limit_results
        self.found_exact_match = False

    def run(self):
        if(self.query_type == 'people_search'):
            self.results = query_people_database(self.query, self.limit_results)
        elif(self.query_type == 'company_search'):
            self.found_exact_match, self.results = query_company_database(self.query)
        elif(self.query_type == 'lucene_search'):
            self.results = query_lucene_index(self.query, 'fg_people_index', self.limit_results)

#-------------------------
def query_people_database(query, limit_results=DEFAULT_LIMIT):

    '''
    substring-search people names from the cache, then load the
    matching records from mongodb
    '''

    from substring_search import search_people_by_name

    people = search_people_by_name(query, limit_results)

    results = []
    mongo_query = []

    if people is not None:

        if(len(people) == 0):
            return results
        else:
            db = c['freegraph']

            #the $in query is much faster but messes up the intended results order
            reply = db.nodes.find({"full_name": {'$in': people}})

            #the reply is a cursor; need to load the actual results first
            for i in reply:
                results.append(i['data'])

    temp = json.dumps(results, default=json_util.default)
    return json.loads(temp)

#-----------------------------------
def query_lucene_index(query, index, limit_results=DEFAULT_LIMIT):

    from pyes import StringQuery, ES
    conn = ES()

    q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='and')
    count = conn.count(query=q)
    count = count.count

    #using 'or' gives more results but result quality goes down
    if(count == 0):
        q = StringQuery(query, search_fields=['full_name', 'bio', 'data'], default_operator='or')

    results = conn.search(query=q, size=20, indices=[index])

    results_list = []

    counter = 0

    for i in results:

        temp = json.loads(i['data'])

        results_list.append(temp)

        counter += 1

        if(counter == limit_results):
            break

    return results_list

#----------------------------------
def test_alphanumeric(query):

    '''
    check if the query has only alphanumeric characters or not
    '''

    import re
    valid = re.match('^(\w+(\s)*\w*)+$', query) is not None

    #return valid
    return True

#-----------------------------------
@app.route('/search/people', methods=['GET'])
def get_people():

    query = request.values['query']
    new_limit = DEFAULT_LIMIT

    try:
        new_limit = int(request.values['limit_results'])
    except:
        pass

    '''
    cache_key = str('scopesearch_cache_' + query.lower())
    cache_reply = mc.get(cache_key)

    #if a cache hit, respond straight away
    if(cache_reply != None):
        return jsonify(cache_reply)
    '''

    results_people = []

    if test_alphanumeric(query) is False:
        pass
    else:

        threads = []

        t1 = QueryThread(query, 'people_search', new_limit)
        #t2 = QueryThread(query, 'company_search', new_limit)
        t3 = QueryThread(query, 'lucene_search', new_limit)

        threads.append(t1)
        #threads.append(t2)
        threads.append(t3)

        #start all threads
        [x.start() for x in threads]

        #wait for all of them to finish
        [x.join() for x in threads]

        #at this point all threads have finished and all queries have been performed

        #first, check people names
        people_first_source = t1.results
        #people_first_source = []

        results_people += people_first_source

        '''
        #second, check company names
        found_exact_match, results_second_source = t2.found_exact_match, t2.results

        #if found exact match then results are people working in that company
        if(found_exact_match):
            results_people += results_second_source
        #else results are list of possible companies
        else:
            results_companies = results_second_source
        '''

        #the third component is the lucene results
        results_lucene = t3.results

        #lucene results are people
        results_people += results_lucene

        '''
        #dedup all results before sending out
        from substring_search import dedup_search_results
        results_people = dedup_search_results(results_people)

        from substring_search import fix_search_order
        results_people = fix_search_order(query, results_people)
        '''

    results = {'people': results_people[:new_limit]}

    #mc.set(cache_key, results)

    return jsonify(results)

#-------------------------
def debug(query):

    return

#------------------
if __name__ == '__main__':

    app.run(debug=True, port=5003)
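With the API running (port 5003, per the __main__ block above), a query can be issued directly; both parameters map to request.values in get_people():

> curl "http://localhost:5003/search/people?query=muneeb%20ali&limit_results=5"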
260  blockstack_search/search/substring_search.py  Executable file
@@ -0,0 +1,260 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2013 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

'''
functions for substring search
'''
import sys

from pymongo import MongoClient
c = MongoClient()

from config import DEFAULT_LIMIT

INPUT_OPTIONS = '--create_cache --search <query>'

#-------------------------
def create_dedup_names_cache():

    '''
    takes people/company names from the crunchbase DB and writes deduped names to a 'cache'
    '''

    fg = c['freegraph']

    #delete any old cache
    c.drop_database('fg_search_cache')

    search_cache = c['fg_search_cache']
    people_cache = search_cache.people_cache

    nodes = fg.nodes

    #------------------------------
    #for creating the people cache

    counter = 0

    people_names = []

    for i in nodes.find():

        counter += 1

        if(counter % 1000 == 0):
            print counter

        try:
            name = i['data']['name']['first'].lower() + ' ' + i['data']['name']['last'].lower()
        except:
            pass
        else:
            people_names.append(name)

    dedup_people_names = list(set(people_names))

    insert_people_names = {'dedup_people_names': dedup_people_names}

    #save the final deduped results to mongodb (using it as a cache)
    people_cache.save(insert_people_names)

    #print '-' * 5
    #log.debug('Created deduped people_cache: %s from %s', len(dedup_people_names), len(people_names))
    #log.debug('Creating company cache ...')

    #db.posts.ensure_index('full_name')
    #log.debug('DONE! All set for searching now.')

#-------------------------
def anyword_substring_search_inner(query_word, target_words):

    '''
    return the query_word if ANY target_word starts with it, else False
    '''

    for target_word in target_words:

        if(target_word.startswith(query_word)):
            return query_word

    return False

#-------------------------
def anyword_substring_search(target_words, query_words):

    '''
    return True if all query_words match
    '''

    matches_required = len(query_words)
    matches_found = 0

    for query_word in query_words:

        reply = anyword_substring_search_inner(query_word, target_words)

        if reply is not False:

            matches_found += 1

        else:
            #important: otherwise we'd keep checking when the final answer is already False
            return False

    if(matches_found == matches_required):
        return True
    else:
        return False

#-------------------------
def substring_search(query, list_of_strings, limit_results=DEFAULT_LIMIT):

    '''
    main function to call for searching
    '''

    matching = []

    query_words = query.split(' ')
    #sort by longest word (highest probability of not finding a match)
    query_words.sort(key=len, reverse=True)

    counter = 0

    for s in list_of_strings:

        target_words = s.split(' ')

        #the anyword searching function is separate
        if(anyword_substring_search(target_words, query_words)):
            matching.append(s)

            #limit results
            counter += 1
            if(counter == limit_results):
                break

    return matching

#-------------------------
def search_people_by_name(query, limit_results=DEFAULT_LIMIT):

    query = query.lower()

    #---------------------
    #using mongodb as a cache, load data into people_names
    search_cache = c['fg_search_cache']

    people_names = []

    for i in search_cache.people_cache.find():
        people_names = i['dedup_people_names']
    #---------------------

    results = substring_search(query, people_names, limit_results)

    return results

#-------------------------
def fix_search_order(query, search_results):

    results = search_results

    results_names = []
    old_query = query
    query = query.split(' ')

    first_word = ''
    second_word = ''
    third_word = ''

    if(len(query) < 2):
        first_word = old_query
    else:
        first_word = query[0]
        second_word = query[1]

        if(len(query) > 2):
            third_word = query[2]

    #save results for multiple passes
    results_second = []
    results_third = []

    #------------------------
    for result in results:

        result_list = result['full_name'].split(' ')

        try:
            if(result_list[0].startswith(first_word)):
                results_names.append(result)
            else:
                results_second.append(result)
        except:
            results_second.append(result)

    #------------------------
    for result in results_second:

        result_list = result['full_name'].split(' ')

        try:
            if(result_list[1].startswith(first_word)):
                results_names.append(result)
            else:
                results_third.append(result)
        except:
            results_third.append(result)
    #------------------------

    #results are either in results_names (filtered) or unprocessed in results_third (last pass)
    return results_names + results_third

#-------------------------
def dedup_search_results(search_results):
    '''
    dedup results based on 'slug'
    '''

    known_links = set()
    deduped_results = []

    for i in search_results:

        link = i['url']

        if link in known_links:
            continue

        deduped_results.append(i)

        known_links.add(link)

    return deduped_results

#-------------------------
if __name__ == "__main__":

    try:

        if(len(sys.argv) < 2):
            print "Usage error"

        option = sys.argv[1]

        if(option == '--create_cache'):
            create_dedup_names_cache()
        elif(option == '--search'):
            query = sys.argv[2]
            print search_people_by_name(query, DEFAULT_LIMIT)

        else:
            print "Usage error"

    except Exception as e:
        print e
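A worked example of the any-word prefix matching (a sketch; note that importing substring_search opens a MongoClient at module load, so a local mongod must be reachable):

    from substring_search import anyword_substring_search, substring_search

    target = 'peter thiel'.split(' ')
    print anyword_substring_search(target, ['pet', 'thi'])    #True: 'pet'->'peter', 'thi'->'thiel'
    print anyword_substring_search(target, ['peter', 'x'])    #False: no word starts with 'x'

    print substring_search('pet thi', ['peter thiel', 'peter parker'])
    #['peter thiel']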
72  blockstack_search/search/test_client.py  Executable file
@@ -0,0 +1,72 @@
#!/usr/bin/env python
#-----------------------
# Copyright 2013 Halfmoon Labs, Inc.
# All Rights Reserved
#-----------------------

'''
For testing the search API from the command line
'''

import sys
import requests
import json

#-------------------------
def search_client(query, server):

    print '-' * 10
    print "Searching for: " + query
    print '-' * 10

    url = 'http://localhost:5000/search/people'

    if(server == 'remote'):
        url = 'http://54.200.209.148/search/people'

    print url

    data = {'query': query, 'limit_results': 35}

    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}

    r = requests.get(url, params=data, headers=headers)

    print r

    temp = r.json()

    print '-' * 10

    print "People: "

    for i in temp['people']:

        print i
        #print i['first_name'] + ' ' + i['last_name'] + ' | ' + 'http://www.crunchbase.com/person/' + i['crunchbase_slug']

    if(len(temp['companies']) > 0):

        print '-' * 10
        print "Companies: "

        for i in temp['companies']:
            print i

    print '-' * 10

#-------------------------
if __name__ == "__main__":

    if(len(sys.argv) < 2):
        print "Error: more arguments needed"

    query = sys.argv[1]
    server = 'local'

    try:
        server = sys.argv[2]
    except:
        pass

    search_client(query, server)
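Usage mirrors the README's quick-testing section; the optional second argument selects the server and defaults to 'local':

> ./test_client.py "peter thiel"
> ./test_client.py "peter thiel" remote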