mirror of
https://github.com/lockin-bot/ProfileMatching.git
synced 2026-01-12 09:44:30 +08:00
This module provides comprehensive Twitter-to-Telegram profile matching and verification using 10 different matching methods and LLM verification. Features: - 10 matching methods (phash, usernames, bio handles, URL resolution, fuzzy names) - URL resolution integration for t.co → t.me links - Async LLM verification with GPT-5-mini - Interactive menu system with real-time stats - Threaded candidate finding (~1.5 contacts/sec) - Comprehensive documentation and guides Key Components: - find_twitter_candidates.py: Core matching logic (10 methods) - find_twitter_candidates_threaded.py: Threaded implementation - verify_twitter_matches_v2.py: LLM verification (V5 prompt) - review_match_quality.py: Analysis and quality review - main.py: Interactive menu system - Complete documentation (README, CHANGELOG, QUICKSTART) Performance: - Candidate finding: ~16-18 hours for 43K contacts - LLM verification: ~23 hours for 43K users - Cost: ~$130 for full verification 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
852 lines
33 KiB
Python
Executable File
852 lines
33 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Twitter-Telegram Candidate Finder
|
|
Finds potential Twitter matches for Telegram contacts using:
|
|
1. Handle extraction from bios
|
|
2. Username variation generation
|
|
3. Fuzzy name matching
|
|
"""
|
|
|
|
import sys
|
|
import re
|
|
import json
|
|
from pathlib import Path
|
|
from typing import List, Dict, Set, Tuple
|
|
import psycopg2
|
|
from psycopg2.extras import DictCursor, execute_values
|
|
|
|
# Add parent directory to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
|
|
|
|
from db_config import SessionLocal
|
|
from models import Contact
|
|
|
|
# Twitter database connection (adjust as needed)
|
|
TWITTER_DB_CONFIG = {
|
|
'dbname': 'twitter_data',
|
|
'user': 'andrewjiang', # Adjust to your setup
|
|
'host': 'localhost',
|
|
'port': 5432
|
|
}
|
|
|
|
|
|
class HandleExtractor:
|
|
"""Extract Twitter handles from text"""
|
|
|
|
@staticmethod
|
|
def extract_handles(text: str) -> List[str]:
|
|
"""Extract Twitter handles from bio/text"""
|
|
if not text:
|
|
return []
|
|
|
|
handles = set()
|
|
|
|
# Pattern 1: @username
|
|
pattern1 = r'@([a-zA-Z0-9_]{4,15})'
|
|
handles.update(re.findall(pattern1, text))
|
|
|
|
# Pattern 2: twitter.com/username or x.com/username
|
|
pattern2 = r'(?:twitter\.com|x\.com)/([a-zA-Z0-9_]{4,15})'
|
|
handles.update(re.findall(pattern2, text, re.IGNORECASE))
|
|
|
|
# Pattern 3: Clean standalone handles (risky, be conservative)
|
|
# Only if text is short and looks like a handle
|
|
if len(text) < 30 and text.count('@') == 1:
|
|
clean = text.strip('@').strip()
|
|
if re.match(r'^[a-zA-Z0-9_]{4,15}$', clean):
|
|
handles.add(clean)
|
|
|
|
return [h.lower() for h in handles if len(h) >= 4]
|
|
|
|
|
|
class UsernameVariationGenerator:
|
|
"""Generate Twitter handle variations from Telegram usernames"""
|
|
|
|
@staticmethod
|
|
def generate_variations(telegram_username: str) -> List[str]:
|
|
"""
|
|
Generate possible Twitter handle variations
|
|
|
|
Examples:
|
|
- alice_0x → [alice_0x, 0xalice, 0x_alice, alice0x]
|
|
- trader_69 → [trader_69, 69trader, 69_trader, trader69]
|
|
"""
|
|
if not telegram_username:
|
|
return []
|
|
|
|
variations = [telegram_username.lower()]
|
|
|
|
# Remove underscores
|
|
no_underscore = telegram_username.replace('_', '')
|
|
if no_underscore != telegram_username and len(no_underscore) >= 4:
|
|
variations.append(no_underscore.lower())
|
|
|
|
# Handle "0x" patterns (common in crypto)
|
|
if '0x' in telegram_username.lower():
|
|
# If 0x at end, try moving to front
|
|
if telegram_username.lower().endswith('_0x'):
|
|
base = telegram_username[:-3]
|
|
variations.extend([
|
|
f"0x{base}".lower(),
|
|
f"0x_{base}".lower()
|
|
])
|
|
elif telegram_username.lower().endswith('0x'):
|
|
base = telegram_username[:-2]
|
|
variations.extend([
|
|
f"0x{base}".lower(),
|
|
f"0x_{base}".lower()
|
|
])
|
|
|
|
# Try without underscore
|
|
no_under = telegram_username.replace('_', '')
|
|
if '0x' in no_under.lower() and no_under.lower() not in variations:
|
|
variations.append(no_under.lower())
|
|
|
|
# Handle trailing numbers (alice_69 → 69alice)
|
|
match = re.match(r'^([a-z_]+?)_?(\d+)$', telegram_username, re.IGNORECASE)
|
|
if match:
|
|
prefix, number = match.groups()
|
|
prefix = prefix.rstrip('_')
|
|
if len(f"{number}{prefix}") >= 4:
|
|
variations.extend([
|
|
f"{number}{prefix}".lower(),
|
|
f"{number}_{prefix}".lower()
|
|
])
|
|
|
|
# Handle leading numbers (69_alice → alice69)
|
|
match = re.match(r'^(\d+)_?([a-z_]+)$', telegram_username, re.IGNORECASE)
|
|
if match:
|
|
number, suffix = match.groups()
|
|
suffix = suffix.lstrip('_')
|
|
if len(f"{suffix}{number}") >= 4:
|
|
variations.extend([
|
|
f"{suffix}{number}".lower(),
|
|
f"{suffix}_{number}".lower()
|
|
])
|
|
|
|
# Single character removals (banteg → bantg, trader → trade)
|
|
# This catches shortened versions of usernames
|
|
base = telegram_username.lower()
|
|
if len(base) > 4: # Only if result would be at least 4 chars
|
|
for i in range(len(base)):
|
|
variation = base[:i] + base[i+1:]
|
|
if len(variation) >= 4:
|
|
variations.append(variation)
|
|
|
|
# Deduplicate and validate (4-15 chars for Twitter)
|
|
valid = []
|
|
for v in set(variations):
|
|
v_clean = v.strip('_')
|
|
if 4 <= len(v_clean) <= 15:
|
|
valid.append(v_clean)
|
|
|
|
return list(set(valid))
|
|
|
|
|
|
class TwitterMatcher:
|
|
"""Find Twitter profiles matching Telegram contacts"""
|
|
|
|
def __init__(self, twitter_conn, telegram_conn=None):
|
|
self.twitter_conn = twitter_conn
|
|
self.telegram_conn = telegram_conn # Needed for phash lookups
|
|
self.handle_extractor = HandleExtractor()
|
|
self.variation_generator = UsernameVariationGenerator()
|
|
|
|
def find_by_handle(self, handle: str) -> Dict:
|
|
"""Lookup Twitter profile by exact handle"""
|
|
with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
|
|
cur.execute("""
|
|
SELECT
|
|
id,
|
|
username,
|
|
name,
|
|
description,
|
|
location,
|
|
verified,
|
|
is_blue_verified,
|
|
followers_count,
|
|
following_count,
|
|
created_at
|
|
FROM public.users
|
|
WHERE LOWER(username) = %s
|
|
LIMIT 1
|
|
""", (handle.lower(),))
|
|
|
|
result = cur.fetchone()
|
|
return dict(result) if result else None
|
|
|
|
def find_by_fuzzy_name(self, telegram_name: str, limit=3) -> List[Dict]:
|
|
"""Find Twitter profiles with similar names using fuzzy matching"""
|
|
if not telegram_name or len(telegram_name) < 3:
|
|
return []
|
|
|
|
with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
|
|
# Use parameterized query with proper escaping for % operator
|
|
cur.execute("""
|
|
SELECT
|
|
id,
|
|
username,
|
|
name,
|
|
description,
|
|
location,
|
|
verified,
|
|
is_blue_verified,
|
|
followers_count,
|
|
following_count,
|
|
created_at,
|
|
similarity(name, %(name)s) AS name_score
|
|
FROM public.users
|
|
WHERE name %% %(name)s -- %% for similarity operator (escaped in string)
|
|
AND similarity(name, %(name)s) > 0.65 -- Increased threshold from 0.5 to reduce noise
|
|
ORDER BY name_score DESC
|
|
LIMIT %(limit)s
|
|
""", {'name': telegram_name, 'limit': limit})
|
|
|
|
return [dict(row) for row in cur.fetchall()]
|
|
|
|
def find_by_display_name_containment(self, telegram_name: str, limit=5) -> List[Dict]:
|
|
"""Find Twitter profiles where TG display name is contained in TW display name"""
|
|
if not telegram_name or len(telegram_name) < 3:
|
|
return []
|
|
|
|
with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
|
|
# Direct containment search (case-insensitive)
|
|
cur.execute("""
|
|
SELECT
|
|
id,
|
|
username,
|
|
name,
|
|
description,
|
|
location,
|
|
verified,
|
|
is_blue_verified,
|
|
followers_count,
|
|
following_count,
|
|
created_at
|
|
FROM public.users
|
|
WHERE name ILIKE %s -- TG name contained in TW name
|
|
AND LENGTH(name) >= %s -- TW name must be at least as long as TG name
|
|
ORDER BY followers_count DESC -- Prioritize by follower count
|
|
LIMIT %s
|
|
""", (f'%{telegram_name}%', len(telegram_name), limit))
|
|
|
|
return [dict(row) for row in cur.fetchall()]
|
|
|
|
def find_by_phash_match(self, telegram_user_id: int) -> List[Dict]:
|
|
"""Find Twitter profiles with matching profile picture phash (distance 0-1 only)"""
|
|
if not self.telegram_conn:
|
|
return []
|
|
|
|
with self.telegram_conn.cursor(cursor_factory=DictCursor) as cur:
|
|
# Query pre-computed phash matches (distance 0-1 for high confidence)
|
|
cur.execute("""
|
|
SELECT
|
|
m.twitter_user_id,
|
|
m.twitter_username,
|
|
m.hamming_distance,
|
|
m.telegram_phash,
|
|
m.twitter_phash
|
|
FROM telegram_twitter_phash_matches m
|
|
WHERE m.telegram_user_id = %s
|
|
AND m.hamming_distance <= 1 -- Only exact and distance-1 matches
|
|
ORDER BY m.hamming_distance ASC
|
|
LIMIT 5
|
|
""", (telegram_user_id,))
|
|
|
|
phash_matches = cur.fetchall()
|
|
|
|
# Commit to close transaction
|
|
self.telegram_conn.commit()
|
|
|
|
if not phash_matches:
|
|
return []
|
|
|
|
# Fetch full Twitter profile data for matched users
|
|
twitter_ids = [m['twitter_user_id'] for m in phash_matches]
|
|
|
|
with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
|
|
cur.execute("""
|
|
SELECT
|
|
id,
|
|
username,
|
|
name,
|
|
description,
|
|
location,
|
|
verified,
|
|
is_blue_verified,
|
|
followers_count,
|
|
following_count,
|
|
created_at
|
|
FROM public.users
|
|
WHERE id = ANY(%s)
|
|
""", (twitter_ids,))
|
|
|
|
twitter_profiles = [dict(row) for row in cur.fetchall()]
|
|
|
|
# Enrich with phash match details
|
|
twitter_profile_map = {str(p['id']): p for p in twitter_profiles}
|
|
results = []
|
|
|
|
for match in phash_matches:
|
|
tw_id = match['twitter_user_id']
|
|
if tw_id in twitter_profile_map:
|
|
profile = twitter_profile_map[tw_id].copy()
|
|
profile['phash_distance'] = match['hamming_distance']
|
|
profile['telegram_phash'] = match['telegram_phash']
|
|
profile['twitter_phash'] = match['twitter_phash']
|
|
results.append(profile)
|
|
|
|
return results
|
|
|
|
def find_by_telegram_in_twitter_bio(self, telegram_username: str, limit=3) -> List[Dict]:
|
|
"""Find Twitter profiles that mention this Telegram username in their bio (exact @mention only)"""
|
|
if not telegram_username or len(telegram_username) < 4:
|
|
return []
|
|
|
|
with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
|
|
# Search for various Telegram handle patterns in Twitter bios
|
|
# Use regex with word boundaries to avoid substring matches in other handles
|
|
cur.execute("""
|
|
SELECT
|
|
id,
|
|
username,
|
|
name,
|
|
description,
|
|
location,
|
|
verified,
|
|
is_blue_verified,
|
|
followers_count,
|
|
following_count,
|
|
created_at
|
|
FROM public.users
|
|
WHERE description IS NOT NULL
|
|
AND (
|
|
description ~* %s -- @username with word boundary (not part of longer handle)
|
|
OR LOWER(description) LIKE %s -- t.me/username
|
|
OR LOWER(description) LIKE %s -- telegram.me/username
|
|
)
|
|
ORDER BY followers_count DESC
|
|
LIMIT %s
|
|
""", (
|
|
r'@' + telegram_username + r'(\s|$|[^a-zA-Z0-9_])', # Word boundary: space, end, or non-alphanumeric
|
|
f'%t.me/{telegram_username.lower()}%',
|
|
f'%telegram.me/{telegram_username.lower()}%',
|
|
limit
|
|
))
|
|
|
|
return [dict(row) for row in cur.fetchall()]
|
|
|
|
def find_by_resolved_url(self, telegram_username: str) -> List[Dict]:
|
|
"""Find Twitter profiles whose bio URL resolves to this Telegram username"""
|
|
if not telegram_username or len(telegram_username) < 4:
|
|
return []
|
|
|
|
with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
|
|
# Query url_resolution_queue for Twitter profiles with resolved Telegram links
|
|
cur.execute("""
|
|
SELECT DISTINCT
|
|
u.id,
|
|
u.username,
|
|
u.name,
|
|
u.description,
|
|
u.location,
|
|
u.verified,
|
|
u.is_blue_verified,
|
|
u.followers_count,
|
|
u.following_count,
|
|
u.created_at,
|
|
urq.resolved_url,
|
|
urq.original_url
|
|
FROM url_resolution_queue urq
|
|
JOIN public.users u ON u.id::text = urq.twitter_id
|
|
WHERE urq.telegram_handles IS NOT NULL
|
|
AND urq.telegram_handles @> %s::jsonb
|
|
ORDER BY u.followers_count DESC
|
|
LIMIT 3
|
|
""", (f'["{telegram_username.lower()}"]',))
|
|
|
|
results = cur.fetchall()
|
|
|
|
# Enrich with URL details
|
|
enriched = []
|
|
for row in results:
|
|
profile = dict(row)
|
|
profile['resolved_telegram_url'] = profile.pop('resolved_url')
|
|
profile['original_url'] = profile.pop('original_url')
|
|
enriched.append(profile)
|
|
|
|
return enriched
|
|
|
|
def find_by_username_in_display_name(self, search_term: str, is_telegram: bool, limit: int = 5) -> List[Dict]:
|
|
"""
|
|
Find Twitter profiles where display name contains username pattern
|
|
|
|
If is_telegram=True: Search for TG username in Twitter display names
|
|
If is_telegram=False: Search for Twitter username pattern in TG display names (reverse: find Twitter profiles whose username matches the TG display name)
|
|
"""
|
|
with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
|
|
if is_telegram:
|
|
# TG username in Twitter display name
|
|
cur.execute("""
|
|
SELECT
|
|
id,
|
|
username,
|
|
name,
|
|
description,
|
|
location,
|
|
verified,
|
|
is_blue_verified,
|
|
followers_count,
|
|
following_count,
|
|
created_at
|
|
FROM public.users
|
|
WHERE name IS NOT NULL
|
|
AND LOWER(name) LIKE %s
|
|
ORDER BY followers_count DESC
|
|
LIMIT %s
|
|
""", (f'%{search_term.lower()}%', limit))
|
|
else:
|
|
# Twitter username in TG display name - search for Twitter profiles whose username appears in the search term
|
|
cur.execute("""
|
|
SELECT
|
|
id,
|
|
username,
|
|
name,
|
|
description,
|
|
location,
|
|
verified,
|
|
is_blue_verified,
|
|
followers_count,
|
|
following_count,
|
|
created_at
|
|
FROM public.users
|
|
WHERE username IS NOT NULL
|
|
AND LOWER(%s) LIKE '%%' || LOWER(username) || '%%'
|
|
ORDER BY followers_count DESC
|
|
LIMIT %s
|
|
""", (search_term, limit))
|
|
|
|
return [dict(row) for row in cur.fetchall()]
|
|
|
|
def get_display_name(self, contact: Contact) -> str:
|
|
"""Get display name with fallback to first_name + last_name"""
|
|
if contact.display_name:
|
|
return contact.display_name
|
|
# Fallback to first_name + last_name
|
|
parts = [contact.first_name, contact.last_name]
|
|
return ' '.join(p for p in parts if p).strip()
|
|
|
|
def find_candidates_for_contact(self, contact: Contact) -> List[Dict]:
|
|
"""
|
|
Find all Twitter candidates for a single Telegram contact
|
|
|
|
Returns list of candidates with:
|
|
{
|
|
'twitter_profile': {...},
|
|
'match_method': 'exact_bio_handle' | 'exact_username' | 'username_variation' | 'fuzzy_name',
|
|
'baseline_confidence': 0.0-1.0,
|
|
'match_details': {...}
|
|
}
|
|
"""
|
|
candidates = []
|
|
seen_twitter_ids = set()
|
|
|
|
# Get display name with fallback
|
|
display_name = self.get_display_name(contact)
|
|
|
|
# Method 1: Phash matching (profile picture similarity)
|
|
phash_matches = self.find_by_phash_match(contact.user_id)
|
|
for twitter_profile in phash_matches:
|
|
phash_distance = twitter_profile.pop('phash_distance')
|
|
telegram_phash = twitter_profile.pop('telegram_phash')
|
|
twitter_phash = twitter_profile.pop('twitter_phash')
|
|
|
|
# Baseline confidence based on phash distance
|
|
if phash_distance == 0:
|
|
baseline_confidence = 0.95 # Exact match - VERY strong signal
|
|
elif phash_distance == 1:
|
|
baseline_confidence = 0.88 # 1-bit difference - strong signal
|
|
else:
|
|
continue # Skip distance > 1
|
|
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'phash_match',
|
|
'baseline_confidence': baseline_confidence,
|
|
'match_details': {
|
|
'phash_distance': phash_distance,
|
|
'telegram_phash': telegram_phash,
|
|
'twitter_phash': twitter_phash
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
# Method 2: Extract handles from bio
|
|
if contact.bio:
|
|
bio_handles = self.handle_extractor.extract_handles(contact.bio)
|
|
for handle in bio_handles:
|
|
twitter_profile = self.find_by_handle(handle)
|
|
if twitter_profile and twitter_profile['id'] not in seen_twitter_ids:
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'exact_bio_handle',
|
|
'baseline_confidence': 0.95,
|
|
'match_details': {
|
|
'extracted_handle': handle,
|
|
'from_bio': True
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
# Method 3: Exact username match
|
|
if contact.username:
|
|
twitter_profile = self.find_by_handle(contact.username)
|
|
if twitter_profile and twitter_profile['id'] not in seen_twitter_ids:
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'exact_username',
|
|
'baseline_confidence': 0.90,
|
|
'match_details': {
|
|
'telegram_username': contact.username,
|
|
'twitter_username': twitter_profile['username']
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
# Method 4: Username variations
|
|
if contact.username:
|
|
variations = self.variation_generator.generate_variations(contact.username)
|
|
for variation in variations:
|
|
if variation == contact.username.lower():
|
|
continue # Already checked in Method 2
|
|
|
|
twitter_profile = self.find_by_handle(variation)
|
|
if twitter_profile and twitter_profile['id'] not in seen_twitter_ids:
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'username_variation',
|
|
'baseline_confidence': 0.80,
|
|
'match_details': {
|
|
'telegram_username': contact.username,
|
|
'username_variation': variation,
|
|
'twitter_username': twitter_profile['username']
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
# Method 5: Twitter bio contains Telegram username (reverse lookup)
|
|
if contact.username:
|
|
reverse_matches = self.find_by_telegram_in_twitter_bio(contact.username, limit=3)
|
|
for twitter_profile in reverse_matches:
|
|
if twitter_profile['id'] not in seen_twitter_ids:
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'twitter_bio_has_telegram',
|
|
'baseline_confidence': 0.92, # Very high confidence - they explicitly mention their Telegram
|
|
'match_details': {
|
|
'telegram_username': contact.username,
|
|
'twitter_username': twitter_profile['username'],
|
|
'found_in_twitter_bio': True
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
# Method 5b: Twitter bio URL resolves to Telegram username (via url_resolution_queue)
|
|
if contact.username:
|
|
url_resolved_matches = self.find_by_resolved_url(contact.username)
|
|
for twitter_profile in url_resolved_matches:
|
|
if twitter_profile['id'] not in seen_twitter_ids:
|
|
resolved_url = twitter_profile.pop('resolved_telegram_url', None)
|
|
original_url = twitter_profile.pop('original_url', None)
|
|
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'twitter_bio_url_resolves_to_telegram',
|
|
'baseline_confidence': 0.95, # VERY high confidence - explicit URL link in bio
|
|
'match_details': {
|
|
'telegram_username': contact.username,
|
|
'twitter_username': twitter_profile['username'],
|
|
'resolved_url': resolved_url,
|
|
'original_url': original_url,
|
|
'found_via_url_resolution': True
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
# Method 6: Display name containment (TG name in TW name)
|
|
if display_name:
|
|
containment_matches = self.find_by_display_name_containment(display_name, limit=5)
|
|
for twitter_profile in containment_matches:
|
|
if twitter_profile['id'] not in seen_twitter_ids:
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'display_name_containment',
|
|
'baseline_confidence': 0.92, # High confidence for exact name containment
|
|
'match_details': {
|
|
'telegram_name': display_name,
|
|
'twitter_name': twitter_profile['name'],
|
|
'match_type': 'tg_name_contained_in_tw_name'
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
# Method 7: Fuzzy name match (always run to find additional candidates)
|
|
if display_name:
|
|
fuzzy_matches = self.find_by_fuzzy_name(display_name, limit=5)
|
|
for i, twitter_profile in enumerate(fuzzy_matches):
|
|
name_score = twitter_profile.pop('name_score')
|
|
|
|
# Calculate baseline confidence from name similarity
|
|
baseline_confidence = min(0.85, name_score) # Cap at 0.85 for fuzzy matches
|
|
|
|
if twitter_profile['id'] not in seen_twitter_ids:
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'fuzzy_name',
|
|
'baseline_confidence': baseline_confidence,
|
|
'match_details': {
|
|
'telegram_name': display_name,
|
|
'twitter_name': twitter_profile['name'],
|
|
'fuzzy_score': name_score,
|
|
'candidate_rank': i + 1
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
# Method 8: TG username in Twitter display name
|
|
if contact.username:
|
|
matches = self.find_by_username_in_display_name(contact.username, is_telegram=True, limit=3)
|
|
for twitter_profile in matches:
|
|
if twitter_profile['id'] not in seen_twitter_ids:
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'tg_username_in_twitter_name',
|
|
'baseline_confidence': 0.88,
|
|
'match_details': {
|
|
'telegram_username': contact.username,
|
|
'twitter_name': twitter_profile['name'],
|
|
'found_in_display_name': True
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
# Method 9: Twitter username in TG display name
|
|
if display_name:
|
|
matches = self.find_by_username_in_display_name(display_name, is_telegram=False, limit=3)
|
|
for twitter_profile in matches:
|
|
if twitter_profile['id'] not in seen_twitter_ids:
|
|
candidates.append({
|
|
'twitter_profile': twitter_profile,
|
|
'match_method': 'twitter_username_in_tg_name',
|
|
'baseline_confidence': 0.86,
|
|
'match_details': {
|
|
'telegram_name': display_name,
|
|
'twitter_username': twitter_profile['username'],
|
|
'username_in_display_name': True
|
|
}
|
|
})
|
|
seen_twitter_ids.add(twitter_profile['id'])
|
|
|
|
return candidates
|
|
|
|
|
|
def save_candidates_to_db(telegram_user_id: int, account_id: int, candidates: List[Dict], telegram_db):
|
|
"""Save candidates to twitter_match_candidates table"""
|
|
if not candidates:
|
|
return
|
|
|
|
insert_data = []
|
|
for cand in candidates:
|
|
tw = cand['twitter_profile']
|
|
insert_data.append((
|
|
account_id, # Add account_id
|
|
telegram_user_id,
|
|
tw['id'],
|
|
tw['username'],
|
|
tw['name'],
|
|
tw.get('description', ''),
|
|
tw.get('location'),
|
|
tw.get('verified', False),
|
|
tw.get('is_blue_verified', False),
|
|
tw.get('followers_count', 0),
|
|
cand.get('match_details', {}).get('candidate_rank', 1),
|
|
cand['match_method'],
|
|
cand['baseline_confidence'],
|
|
json.dumps(cand['match_details']) # Convert dict to JSON string
|
|
))
|
|
|
|
execute_values(telegram_db, """
|
|
INSERT INTO twitter_match_candidates (
|
|
account_id,
|
|
telegram_user_id,
|
|
twitter_id,
|
|
twitter_username,
|
|
twitter_name,
|
|
twitter_bio,
|
|
twitter_location,
|
|
twitter_verified,
|
|
twitter_blue_verified,
|
|
twitter_followers_count,
|
|
candidate_rank,
|
|
match_method,
|
|
baseline_confidence,
|
|
match_signals
|
|
) VALUES %s
|
|
ON CONFLICT DO NOTHING
|
|
""", insert_data, template="""(
|
|
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
|
|
)""")
|
|
|
|
|
|
def main():
|
|
print()
|
|
print("=" * 70)
|
|
print("🔍 Twitter-Telegram Candidate Finder")
|
|
print("=" * 70)
|
|
print()
|
|
|
|
# Check arguments
|
|
test_mode = '--test' in sys.argv
|
|
limit = 100 if test_mode else None
|
|
|
|
# Check for --limit parameter
|
|
if '--limit' in sys.argv:
|
|
idx = sys.argv.index('--limit')
|
|
limit = int(sys.argv[idx + 1])
|
|
print(f"📊 LIMIT MODE: Processing first {limit:,} contacts")
|
|
print()
|
|
elif test_mode:
|
|
print("🧪 TEST MODE: Processing first 100 contacts only")
|
|
print()
|
|
|
|
# Connect to databases
|
|
print("📡 Connecting to databases...")
|
|
|
|
# Connect SQLAlchemy to localhost (not RDS)
|
|
from sqlalchemy import create_engine
|
|
from sqlalchemy.orm import sessionmaker
|
|
localhost_engine = create_engine('postgresql://andrewjiang@localhost:5432/telegram_contacts')
|
|
LocalSession = sessionmaker(bind=localhost_engine)
|
|
telegram_db = LocalSession()
|
|
|
|
try:
|
|
twitter_conn = psycopg2.connect(**TWITTER_DB_CONFIG)
|
|
twitter_conn.autocommit = False
|
|
except Exception as e:
|
|
print(f"❌ Failed to connect to Twitter database: {e}")
|
|
print(f" Config: {TWITTER_DB_CONFIG}")
|
|
return False
|
|
|
|
# Also need psycopg2 connection to telegram DB for writing
|
|
try:
|
|
telegram_conn = psycopg2.connect(dbname='telegram_contacts', user='andrewjiang', host='localhost', port=5432)
|
|
telegram_conn.autocommit = False
|
|
except Exception as e:
|
|
print(f"❌ Failed to connect to Telegram database: {e}")
|
|
return False
|
|
|
|
print("✅ Connected to both databases")
|
|
print()
|
|
|
|
try:
|
|
# Initialize matcher (pass both connections for phash lookup)
|
|
telegram_psycopg_conn = psycopg2.connect(
|
|
dbname='telegram_contacts',
|
|
user='andrewjiang',
|
|
host='localhost'
|
|
)
|
|
matcher = TwitterMatcher(twitter_conn, telegram_psycopg_conn)
|
|
|
|
# Get Telegram contacts with bios or usernames
|
|
print("🔍 Loading Telegram contacts...")
|
|
query = telegram_db.query(Contact).filter(
|
|
Contact.user_id > 0, # Exclude channels
|
|
Contact.is_deleted == False,
|
|
Contact.is_bot == False
|
|
).filter(
|
|
(Contact.bio != None) | (Contact.username != None)
|
|
).order_by(Contact.user_id)
|
|
|
|
if limit:
|
|
query = query.limit(limit)
|
|
|
|
contacts = query.all()
|
|
print(f"✅ Found {len(contacts):,} contacts to process")
|
|
print()
|
|
|
|
# Process each contact
|
|
stats = {
|
|
'processed': 0,
|
|
'with_candidates': 0,
|
|
'total_candidates': 0,
|
|
'by_method': {}
|
|
}
|
|
|
|
print("🚀 Finding Twitter candidates...")
|
|
print()
|
|
|
|
for i, contact in enumerate(contacts, 1):
|
|
candidates = matcher.find_candidates_for_contact(contact)
|
|
|
|
if candidates:
|
|
stats['with_candidates'] += 1
|
|
stats['total_candidates'] += len(candidates)
|
|
|
|
# Track by method
|
|
for cand in candidates:
|
|
method = cand['match_method']
|
|
stats['by_method'][method] = stats['by_method'].get(method, 0) + 1
|
|
|
|
# Save to database
|
|
with telegram_conn.cursor() as cur:
|
|
save_candidates_to_db(contact.user_id, contact.account_id, candidates, cur)
|
|
telegram_conn.commit()
|
|
|
|
stats['processed'] += 1
|
|
|
|
# Progress update every 10 contacts
|
|
if i % 10 == 0:
|
|
print(f" Processed {i:,}/{len(contacts):,} contacts... (candidates: {stats['with_candidates']:,}, total: {stats['total_candidates']:,})")
|
|
elif i == 1:
|
|
# Show first contact immediately
|
|
print(f" Processed 1/{len(contacts):,} contacts... (candidates: {stats['with_candidates']:,}, total: {stats['total_candidates']:,})")
|
|
|
|
# Final stats
|
|
print()
|
|
print("=" * 70)
|
|
print("✅ CANDIDATE FINDING COMPLETE")
|
|
print("=" * 70)
|
|
print()
|
|
print(f"📊 Statistics:")
|
|
print(f" Processed: {stats['processed']:,} contacts")
|
|
print(f" With candidates: {stats['with_candidates']:,} ({stats['with_candidates']/stats['processed']*100:.1f}%)")
|
|
print(f" Total candidates: {stats['total_candidates']:,}")
|
|
print(f" Avg candidates per match: {stats['total_candidates']/max(stats['with_candidates'], 1):.1f}")
|
|
print()
|
|
print(f"📈 By method:")
|
|
for method, count in sorted(stats['by_method'].items(), key=lambda x: -x[1]):
|
|
print(f" {method}: {count:,}")
|
|
print()
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
print(f"❌ Error: {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
return False
|
|
|
|
finally:
|
|
telegram_db.close()
|
|
twitter_conn.close()
|
|
telegram_conn.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
try:
|
|
success = main()
|
|
sys.exit(0 if success else 1)
|
|
except KeyboardInterrupt:
|
|
print("\n\n⚠️ Interrupted by user")
|
|
sys.exit(1)
|