merge in search updates from master

This commit is contained in:
Aaron Blankstein
2017-08-23 15:22:08 -04:00
7 changed files with 232 additions and 165 deletions

View File

@@ -48,10 +48,8 @@ def fetch_profile_data_from_file():
""" takes profile data from file and saves in the profile_data DB
"""
profile_data_file = open(SEARCH_PROFILE_DATA_FILE, 'r')
profiles = profile_data_file.read()
profiles = json.loads(profiles)
with open(SEARCH_PROFILE_DATA_FILE, 'r') as fin:
profiles = json.load(fin)
counter = 0
@@ -59,7 +57,6 @@ def fetch_profile_data_from_file():
log.debug("Fetching profile data from file")
for entry in profiles:
new_entry = {}
new_entry['key'] = entry['fqu']
new_entry['value'] = entry['profile']
@@ -67,16 +64,14 @@ def fetch_profile_data_from_file():
try:
profile_data.save(new_entry)
except Exception as e:
print e
print "Exception on entry {}".format(new_entry)
log.exception(e)
log.error("Exception on entry {}".format(new_entry))
counter += 1
if counter % 1000 == 0:
log.debug("Processed entries: %s" % counter)
profile_data_file.close()
profile_data.ensure_index('key')
return
@@ -160,7 +155,6 @@ def create_search_index():
log.debug("Creating search index")
for user in namespace.find():
# the profile/info to be inserted
search_profile = {}
@@ -179,19 +173,17 @@ def create_search_index():
hasBazaarId=False
# search for openbazaar id in the profile
# search for openbazaar id in the profile
if 'account' in profile:
for accounts in profile['account']:
if accounts['service'] == 'openbazaar':
hasBazaarId = True
search_profile['openbazaar']=accounts['identifier']
# pretty_print(search_profile['openbazaar'])
if (hasBazaarId == False):
search_profile['openbazaar'] = None
if 'name' in profile:
if 'name' in profile:
try:
name = profile['name']
except:
@@ -201,7 +193,6 @@ def create_search_index():
name = name['formatted'].lower()
except:
name = name.lower()
people_names.append(name)
search_profile['name'] = name
@@ -209,7 +200,6 @@ def create_search_index():
search_profile['name'] = None
if 'twitter' in profile:
twitter_handle = profile['twitter']
try:
@@ -232,9 +222,7 @@ def create_search_index():
search_profile['profile'] = profile
search_profiles.save(search_profile)
# dedup names
people_names = list(set(people_names))
people_names = {'name': people_names}
@@ -246,9 +234,6 @@ def create_search_index():
usernames = {'username': usernames}
# save final dedup results to mongodb (using it as a cache)
people_cache.save(people_names)
twitter_cache.save(twitter_handles)
username_cache.save(usernames)
@@ -257,7 +242,6 @@ def create_search_index():
log.debug('Created name/twitter/username search index')
if __name__ == "__main__":
if(len(sys.argv) < 2):

View File

@@ -35,7 +35,7 @@ from api.config import (
from .utils import validUsername
from .utils import get_json, config_log
from blockstack_client import proxy
from blockstack_client import proxy, subdomains
from blockstack_client.profile import get_profile
from api.utils import profile_log
import logging
@@ -49,8 +49,13 @@ def fetch_namespace():
"""
resp = proxy.get_all_names()
subdomaindb = subdomains.SubdomainDB()
subdomain_names = subdomaindb.get_all_subdomains()
all_names = list(resp) + list(subdomain_names)
with open(SEARCH_BLOCKCHAIN_DATA_FILE, 'w') as fout:
fout.write(json.dumps(resp))
fout.write(json.dumps(all_names))
def print_status_bar(filled, total):
pct = float(filled) / total
@@ -67,6 +72,7 @@ def update_profiles():
last_block_processed = search_indexer_info['last_block_height']
last_full_index = search_indexer_info['last_full_index']
last_subdomain_seq = search_indexer_info['last_subdomain_seq']
info_resp = proxy.getinfo()
try:
@@ -75,28 +81,40 @@ def update_profiles():
print info_resp
raise
with open(SEARCH_BLOCKCHAIN_DATA_FILE, 'r') as fin:
existing_names = set(json.load(fin))
if last_block_processed - 1 > new_block_height:
return {'status' : True, 'message' : 'No new blocks since last indexing'}
subdomaindb = subdomains.SubdomainDB()
subdomain_names = [ name for name in
subdomaindb.get_all_subdomains(above_seq = last_subdomain_seq)
if name not in existing_names ]
last_subdomain_seq = subdomaindb.get_last_index()
# aaron: note, sometimes it may take a little while for
# new zonefiles to have propagated to the network, so
# we over-fetch a little bit
zonefiles_resp = proxy.get_zonefiles_by_block(
last_block_processed - 1, new_block_height)
zonefiles_updated = zonefiles_resp['zonefile_info']
names_updated = set(
[ zf_info['name'] for zf_info in zonefiles_updated
if 'name' in zf_info ])
names_updated = [ zf_info['name'] for zf_info in zonefiles_updated
if 'name' in zf_info ]
names_updated += subdomain_names
names_to_insert = set([ name for name in names_updated if name not in existing_names ])
updated_profiles = {}
actually_updated_names = set()
print "Updating {} entries...".format(len(names_updated))
for ix, name in enumerate(names_updated):
print_status_bar(ix+1, len(names_updated))
for ix, name in enumerate(names_to_insert):
print_status_bar(ix+1, len(names_to_insert))
profile_entry = {}
profile_entry['fqu'] = name
try:
profile_entry['profile'] = get_profile(name, use_legacy = True)['profile']
profile_resp = get_profile(name, use_legacy = True)
profile_entry['profile'] = profile_resp['profile']
updated_profiles[name] = (profile_entry)
actually_updated_names.add(name)
except KeyboardInterrupt as e:
@@ -105,13 +123,17 @@ def update_profiles():
import traceback as tb; tb.print_exc()
names_updated = actually_updated_names
if len(names_updated) == 0:
return {'status' : True, 'message' : 'No new profiles'}
with open(SEARCH_PROFILE_DATA_FILE, 'r') as fin:
all_profiles = json.load(fin)
to_remove = []
for ix, profile in enumerate(all_profiles):
if profile['fqu'] in names_updated:
all_profiles[ix] = updated_profiles[profile['fqu']]
existing_names = list(existing_names)
for name_to_add in names_updated:
all_profiles.append(updated_profiles[name_to_add])
existing_names.append(name_to_add)
if not obtain_lockfile():
return {'error' : 'Could not obtain lockfile, abandoning my update.'}
@@ -120,10 +142,13 @@ def update_profiles():
if search_indexer_info['last_full_index'] != last_full_index:
return {'error' : 'Full re-index written during our update. Abandoning'}
with open(SEARCH_BLOCKCHAIN_DATA_FILE, 'w') as fout:
json.dump(existing_names, fout)
with open(SEARCH_PROFILE_DATA_FILE, 'w') as fout:
json.dump(all_profiles, fout)
with open(SEARCH_LAST_INDEX_DATA_FILE, 'w') as fout:
search_indexer_info['last_block_height'] = new_block_height
search_indexer_info['last_subdomain_seq'] = last_subdomain_seq
json.dump(search_indexer_info, fout)
return {'status' : True, 'message' : 'Indexed {} profiles'.format(len(names_updated))}
@@ -138,7 +163,7 @@ def fetch_profiles(max_to_fetch = None, just_test_set = False):
"""
with open(SEARCH_BLOCKCHAIN_DATA_FILE, 'r') as fin:
all_names = json.load(file)
all_names = json.load(fin)
info_resp = proxy.getinfo()
last_block_processed = info_resp['last_block_processed']
@@ -177,12 +202,16 @@ def fetch_profiles(max_to_fetch = None, just_test_set = False):
print "ERROR! Could not obtain lockfile"
return
subdomaindb = subdomains.SubdomainDB()
last_subdomain_seq = subdomaindb.get_last_index()
with open(SEARCH_PROFILE_DATA_FILE, 'w') as fout:
json.dump(all_profiles, fout)
with open(SEARCH_LAST_INDEX_DATA_FILE, 'w') as fout:
search_index_data = {
'last_block_height' : last_block_processed,
'last_full_index' : datetime.now().isoformat()
'last_full_index' : datetime.now().isoformat(),
'last_subdomain_seq' : last_subdomain_seq
}
json.dump(search_index_data, fout)

View File

@@ -1,26 +1,26 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Search
~~~~~
Search
~~~~~
copyright: (c) 2014-2016 by Halfmoon Labs, Inc.
copyright: (c) 2016 by Blockstack.org
copyright: (c) 2014-2016 by Halfmoon Labs, Inc.
copyright: (c) 2016 by Blockstack.org
This file is part of Search.
Search is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Search is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Search is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Search is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Search. If not, see <http://www.gnu.org/licenses/>.
You should have received a copy of the GNU General Public License
along with Search. If not, see <http://www.gnu.org/licenses/>.
"""
import sys
@@ -48,67 +48,67 @@ from api.utils import get_mc_client
mc = get_mc_client()
class QueryThread(threading.Thread):
""" for performing multi-threaded search on three search sub-systems
"""
def __init__(self, query, query_type, limit_results):
threading.Thread.__init__(self)
self.query = query
self.query_type = query_type
self.results = []
self.limit_results = limit_results
self.found_exact_match = False
""" for performing multi-threaded search on three search sub-systems
"""
def __init__(self, query, query_type, limit_results):
threading.Thread.__init__(self)
self.query = query
self.query_type = query_type
self.results = []
self.limit_results = limit_results
self.found_exact_match = False
def run(self):
if(self.query_type == 'people_search'):
self.results = query_people_database(self.query, self.limit_results)
elif(self.query_type == 'twitter_search'):
self.results = query_twitter_database(self.query, self.limit_results)
elif(self.query_type == 'username_search'):
self.results = query_username_database(self.query, self.limit_results)
#self.found_exact_match, self.results = query_company_database(self.query)
if(self.query_type == 'lucene_search'):
self.results = query_lucene_index(self.query, self.limit_results)
def run(self):
if(self.query_type == 'people_search'):
self.results = query_people_database(self.query, self.limit_results)
elif(self.query_type == 'twitter_search'):
self.results = query_twitter_database(self.query, self.limit_results)
elif(self.query_type == 'username_search'):
self.results = query_username_database(self.query, self.limit_results)
#self.found_exact_match, self.results = query_company_database(self.query)
if(self.query_type == 'lucene_search'):
self.results = query_lucene_index(self.query, self.limit_results)
def error_reply(msg, code=-1):
reply = {}
reply['status'] = code
reply['message'] = "ERROR: " + msg
return jsonify(reply)
reply = {}
reply['status'] = code
reply['message'] = "ERROR: " + msg
return jsonify(reply)
def query_people_database(query, limit_results=DEFAULT_LIMIT):
name_search_results = search_people_by_name(query, limit_results)
return fetch_profiles(name_search_results, search_type="name")
name_search_results = search_people_by_name(query, limit_results)
return fetch_profiles(name_search_results, search_type="name")
def query_twitter_database(query, limit_results=DEFAULT_LIMIT):
twitter_search_results = search_people_by_twitter(query, limit_results)
return fetch_profiles(twitter_search_results, search_type="twitter")
twitter_search_results = search_people_by_twitter(query, limit_results)
return fetch_profiles(twitter_search_results, search_type="twitter")
def query_username_database(query, limit_results=DEFAULT_LIMIT):
username_search_results = search_people_by_username(query, limit_results)
return fetch_profiles(username_search_results, search_type="username")
username_search_results = search_people_by_username(query, limit_results)
return fetch_profiles(username_search_results, search_type="username")
def query_lucene_index(query, index, limit_results=DEFAULT_LIMIT):
username_search_results = search_people_by_bio(query, limit_results)
return fetch_profiles(username_search_results, search_type="username")
username_search_results = search_people_by_bio(query, limit_results)
return fetch_profiles(username_search_results, search_type="username")
def test_alphanumeric(query):
""" check if query has only alphanumeric characters or not
"""
""" check if query has only alphanumeric characters or not
"""
import re
valid = re.match(r'^\w+[\s\w]*$', query) is not None
import re
valid = re.match(r'^\w+[\s\w]*$', query) is not None
return True
return True
@searcher.route('/search', methods = ["GET", "POST"], strict_slashes = False)
@@ -116,110 +116,110 @@ def test_alphanumeric(query):
@cache_control(MEMCACHED_TIMEOUT)
def search_by_name():
query = request.args.get('query')
query = request.args.get('query')
results_people = []
results_people = []
if query is None:
return error_reply("No query given")
elif query == '' or query == ' ':
return json.dumps({})
if query is None:
return error_reply("No query given")
elif query == '' or query == ' ':
return json.dumps({})
if MEMCACHED_ENABLED:
if MEMCACHED_ENABLED:
cache_key = str('search_cache_' + query.lower())
cache_reply = mc.get(cache_key)
cache_key = str('search_cache_' + query.lower())
cache_reply = mc.get(cache_key)
# if a cache hit, respond straight away
if(cache_reply is not None):
return jsonify(cache_reply)
# if a cache hit, respond straight away
if(cache_reply is not None):
return jsonify(cache_reply)
new_limit = DEFAULT_LIMIT
new_limit = DEFAULT_LIMIT
try:
new_limit = int(request.values['limit_results'])
except:
pass
try:
new_limit = int(request.values['limit_results'])
except:
pass
if validProofQuery(query):
return search_proofs_index(query)
if validProofQuery(query):
return search_proofs_index(query)
elif test_alphanumeric(query) is False:
pass
elif test_alphanumeric(query) is False:
pass
else:
else:
threads = []
threads = []
t1 = QueryThread(query, 'username_search', new_limit)
t2 = QueryThread(query, 'twitter_search', new_limit)
t3 = QueryThread(query, 'people_search', new_limit)
t1 = QueryThread(query, 'username_search', new_limit)
t2 = QueryThread(query, 'twitter_search', new_limit)
t3 = QueryThread(query, 'people_search', new_limit)
if LUCENE_ENABLED:
t4 = QueryThread(query, 'lucene_search', new_limit)
if LUCENE_ENABLED:
t4 = QueryThread(query, 'lucene_search', new_limit)
threads.append(t1)
threads.append(t2)
threads.append(t3)
threads.append(t1)
threads.append(t2)
threads.append(t3)
if LUCENE_ENABLED:
threads.append(t4)
if LUCENE_ENABLED:
threads.append(t4)
# start all threads
[x.start() for x in threads]
# start all threads
[x.start() for x in threads]
# wait for all of them to finish
[x.join() for x in threads]
# wait for all of them to finish
[x.join() for x in threads]
# at this point all threads have finished and all queries have been performed
# at this point all threads have finished and all queries have been performed
results_username = t1.results
results_twitter = t2.results
results_people = t3.results
results_username = t1.results
results_twitter = t2.results
results_people = t3.results
if LUCENE_ENABLED:
results_bio = t4.results
if LUCENE_ENABLED:
results_bio = t4.results
results_people += results_username + results_twitter
if LUCENE_ENABLED:
results_people += results_bio
results_people += results_username + results_twitter
if LUCENE_ENABLED:
results_people += results_bio
# dedup all results before sending out
from substring_search import dedup_search_results
results_people = dedup_search_results(results_people)
# dedup all results before sending out
from substring_search import dedup_search_results
results_people = dedup_search_results(results_people)
results = {}
results['results'] = results_people[:new_limit]
results = {}
results['results'] = results_people[:new_limit]
if MEMCACHED_ENABLED:
mc.set(cache_key, results, int(time() + MEMCACHED_TIMEOUT))
if MEMCACHED_ENABLED:
mc.set(cache_key, results, int(time() + MEMCACHED_TIMEOUT))
return jsonify(results)
return jsonify(results)
def search_proofs_index(query):
results = {}
results = {}
query = request.args.get('query')
query = request.args.get('query')
if query is None:
return error_reply("No query given")
elif query == '' or query == ' ':
return json.dumps({})
if query is None:
return error_reply("No query given")
elif query == '' or query == ' ':
return json.dumps({})
if MEMCACHED_ENABLED:
if MEMCACHED_ENABLED:
cache_key = str('search_cache_' + query.lower())
cache_reply = mc.get(cache_key)
cache_key = str('search_cache_' + query.lower())
cache_reply = mc.get(cache_key)
# if a cache hit, respond straight away
if(cache_reply is not None):
return jsonify(cache_reply)
# if a cache hit, respond straight away
if(cache_reply is not None):
return jsonify(cache_reply)
results['results'] = search_proofs(query)
results['results'] = search_proofs(query)
if MEMCACHED_ENABLED:
mc.set(cache_key, results, int(time() + MEMCACHED_TIMEOUT))
if MEMCACHED_ENABLED:
mc.set(cache_key, results, int(time() + MEMCACHED_TIMEOUT))
return jsonify(results)
return jsonify(results)

View File

@@ -99,4 +99,8 @@ def validUsername(username):
if a.match(username):
return True
else:
parts = username.split(".")
if len(parts) == 2:
if a.match(parts[0]) and a.match(parts[1]):
return True
return False

View File

@@ -50,7 +50,7 @@ class SearchTestCase(unittest.TestCase):
self.assertIsNotNone(entry['username'])
if entry['username'] not in self.test_users:
continue
self.assertIsNotNone(entry['profile'],
self.assertIsNotNone(entry['profile'],
msg="Error in fetching profile of entry: {}".format(entry))
def test_find_user(self):
@@ -60,3 +60,10 @@ class SearchTestCase(unittest.TestCase):
self.assertIn(u, data['results'][0]['username'])
self.assertIn("profile", data['results'][0].keys())
def test_find_subdomain(self):
for u in ["Thomas Hobbes"]:
data = self.do_search(u)
self.assertTrue(len(data['results']) > 0)
if __name__ == "__main__":
unittest.main()

View File

@@ -36,7 +36,7 @@ from .proxy import (
json_is_error, get_name_blockchain_history, get_name_blockchain_record,
get_default_proxy)
from blockstack_client import storage
from blockstack_client import storage, subdomains
from blockstack_client import user as user_db
from .logger import get_logger
@@ -205,13 +205,13 @@ def get_profile(name, zonefile_storage_drivers=None, profile_storage_drivers=Non
and then loading the profile from that zonefile's public key.
Notes on backwards compatibility (activated if use_legacy=True and use_legacy_zonefile=True):
* (use_legacy=True) If the user's zonefile is really a legacy profile from Onename, then
the profile returned will be the converted legacy profile. The returned zonefile will still
be a legacy profile, however.
The caller can check this and perform the conversion automatically.
* (use_legacy_zonefile=True) If the name points to a current zonefile that does not have a
* (use_legacy_zonefile=True) If the name points to a current zonefile that does not have a
data public key, then the owner address of the name will be used to verify
the profile's authenticity.
@@ -224,6 +224,15 @@ def get_profile(name, zonefile_storage_drivers=None, profile_storage_drivers=Non
proxy = get_default_proxy() if proxy is None else proxy
res = subdomains.is_address_subdomain(str(name))
if res:
subdomain, domain = res[1]
try:
return subdomains.resolve_subdomain(subdomain, domain)
except subdomains.SubdomainNotFound as e:
log.exception(e)
return {'error' : "Failed to find name {}.{}".format(subdomain, domain)}
raw_zonefile = None
if user_zonefile is None:
user_zonefile = get_name_zonefile(

View File

@@ -30,7 +30,7 @@ import virtualchain
from multiprocessing import Pool
from itertools import izip
from blockstack_client import data, storage, config, proxy, schemas, constants
from blockstack_client import storage, config, proxy, schemas, constants
from blockstack_client import zonefile as bs_zonefile
from blockstack_client import user as user_db
from blockstack_client.backend import safety
@@ -178,6 +178,22 @@ class SubdomainDB(object):
return Subdomain(domain_name, subdomain_name, str(encoded_pubkey), int(n), str(zonefile_str), sig, txid)
def get_all_subdomains(self, above_seq = None):
if above_seq:
get_cmd = "SELECT fully_qualified_subdomain FROM {} WHERE sequence >= ?"
else:
get_cmd = "SELECT fully_qualified_subdomain FROM {}"
get_cmd = get_cmd.format(self.subdomain_table)
cursor = self.conn.cursor()
if above_seq:
cursor.execute(get_cmd, (above_seq,))
else:
cursor.execute(get_cmd)
try:
return [ x[0] for x in cursor.fetchall() ]
except:
return []
def get_subdomains_owned_by_address(self, owner):
get_cmd = "SELECT fully_qualified_subdomain FROM {} WHERE owner = ?".format(
self.subdomain_table)
@@ -308,18 +324,34 @@ class SubdomainDB(object):
return 0
return int(last_block)
def get_last_index(self):
"""
Returns the last sequence number for the subdomain DB.
WARNING: this is specific to *this* instance, and *this* DB,
if you use this, it should *only* be as an optimization.
"""
get_cmd = """SELECT sequence FROM {} ORDER BY sequence DESC LIMIT 1""".format(
self.subdomain_table)
cursor = self.conn.cursor()
cursor.execute(get_cmd)
try:
last_seq = cursor.fetchone()[0]
except:
return 0
return int(last_seq)
def _drop_tables(self):
drop_cmd = "DROP TABLE IF EXISTS {};"
cursor = self.conn.cursor()
cursor.execute(drop_cmd.format(self.subdomain_table))
cursor.execute(drop_cmd.format(self.status_table))
cursor.execute(drop_cmd.format(self.status_table))
def _create_tables(self):
create_cmd = """CREATE TABLE IF NOT EXISTS {} (
fully_qualified_subdomain TEXT PRIMARY KEY,
sequence INTEGER,
owner TEXT,
zonefile TEXT,
zonefile TEXT,
signature TEXT,
last_txid TEXT);
""".format(self.subdomain_table)
@@ -344,7 +376,8 @@ def parse_zonefile_subdomains(domain, zonefile_json):
def is_address_subdomain(fqa):
"""
Tests whether fqa is a subdomain.
Tests whether fqa is a subdomain.
@fqa must be a string
If it isn't, returns False.
If it is, returns True and a tuple (subdomain_name, domain)
"""
@@ -448,6 +481,7 @@ def add_subdomains(subdomains, domain_fqa):
def get_subdomain_info(subdomain, domain_fqa, use_cache = True):
if not use_cache:
from blockstack_client import data
zonefiles = data.list_zonefile_history(domain_fqa)
subdomain_db = _build_subdomain_db([domain_fqa for z in zonefiles], zonefiles)
else: