ProfileMatching/find_twitter_candidates.py

#!/usr/bin/env python3
"""
Twitter-Telegram Candidate Finder
Finds potential Twitter matches for Telegram contacts using:
1. Handle extraction from bios
2. Username variation generation
3. Fuzzy name matching
"""

import sys
import re
import json
from pathlib import Path
from typing import List, Dict, Set, Tuple
import psycopg2
from psycopg2.extras import DictCursor, execute_values

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from db_config import SessionLocal
from models import Contact

# Twitter database connection (adjust as needed)
TWITTER_DB_CONFIG = {
    'dbname': 'twitter_data',
    'user': 'andrewjiang',  # Adjust to your setup
    'host': 'localhost',
    'port': 5432
}


class HandleExtractor:
    """Extract Twitter handles from text"""

    @staticmethod
    def extract_handles(text: str) -> List[str]:
        """Extract Twitter handles from bio/text"""
        if not text:
            return []

        handles = set()

        # Pattern 1: @username
        pattern1 = r'@([a-zA-Z0-9_]{4,15})'
        handles.update(re.findall(pattern1, text))

        # Pattern 2: twitter.com/username or x.com/username
        pattern2 = r'(?:twitter\.com|x\.com)/([a-zA-Z0-9_]{4,15})'
        handles.update(re.findall(pattern2, text, re.IGNORECASE))

        # Pattern 3: Clean standalone handles (risky, be conservative)
        # Only if text is short and looks like a handle
        if len(text) < 30 and text.count('@') == 1:
            clean = text.strip('@').strip()
            if re.match(r'^[a-zA-Z0-9_]{4,15}$', clean):
                handles.add(clean)

        return [h.lower() for h in handles if len(h) >= 4]


class UsernameVariationGenerator:
    """Generate Twitter handle variations from Telegram usernames"""

    @staticmethod
    def generate_variations(telegram_username: str) -> List[str]:
        """
        Generate possible Twitter handle variations

        Examples:
        - alice_0x → [alice_0x, 0xalice, 0x_alice, alice0x]
        - trader_69 → [trader_69, 69trader, 69_trader, trader69]
        """
        if not telegram_username:
            return []

        variations = [telegram_username.lower()]

        # Remove underscores
        no_underscore = telegram_username.replace('_', '')
        if no_underscore != telegram_username and len(no_underscore) >= 4:
            variations.append(no_underscore.lower())

        # Handle "0x" patterns (common in crypto)
        if '0x' in telegram_username.lower():
            # If 0x at end, try moving to front
            if telegram_username.lower().endswith('_0x'):
                base = telegram_username[:-3]
                variations.extend([
                    f"0x{base}".lower(),
                    f"0x_{base}".lower()
                ])
            elif telegram_username.lower().endswith('0x'):
                base = telegram_username[:-2]
                variations.extend([
                    f"0x{base}".lower(),
                    f"0x_{base}".lower()
                ])

            # Try without underscore
            no_under = telegram_username.replace('_', '')
            if '0x' in no_under.lower() and no_under.lower() not in variations:
                variations.append(no_under.lower())

        # Handle trailing numbers (alice_69 → 69alice)
        match = re.match(r'^([a-z_]+?)_?(\d+)$', telegram_username, re.IGNORECASE)
        if match:
            prefix, number = match.groups()
            prefix = prefix.rstrip('_')
            if len(f"{number}{prefix}") >= 4:
                variations.extend([
                    f"{number}{prefix}".lower(),
                    f"{number}_{prefix}".lower()
                ])

        # Handle leading numbers (69_alice → alice69)
        match = re.match(r'^(\d+)_?([a-z_]+)$', telegram_username, re.IGNORECASE)
        if match:
            number, suffix = match.groups()
            suffix = suffix.lstrip('_')
            if len(f"{suffix}{number}") >= 4:
                variations.extend([
                    f"{suffix}{number}".lower(),
                    f"{suffix}_{number}".lower()
                ])

        # Single character removals (banteg → bantg, trader → trade)
        # This catches shortened versions of usernames
        base = telegram_username.lower()
        if len(base) > 4:  # Only if result would be at least 4 chars
            for i in range(len(base)):
                variation = base[:i] + base[i+1:]
                if len(variation) >= 4:
                    variations.append(variation)

        # Deduplicate and validate (4-15 chars for Twitter)
        valid = []
        for v in set(variations):
            v_clean = v.strip('_')
            if 4 <= len(v_clean) <= 15:
                valid.append(v_clean)

        return list(set(valid))


class TwitterMatcher:
    """Find Twitter profiles matching Telegram contacts"""

    def __init__(self, twitter_conn, telegram_conn=None):
        self.twitter_conn = twitter_conn
        self.telegram_conn = telegram_conn  # Needed for phash lookups
        self.handle_extractor = HandleExtractor()
        self.variation_generator = UsernameVariationGenerator()

    def find_by_handle(self, handle: str) -> Dict:
        """Lookup Twitter profile by exact handle"""
        with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
            cur.execute("""
                SELECT
                    id,
                    username,
                    name,
                    description,
                    location,
                    verified,
                    is_blue_verified,
                    followers_count,
                    following_count,
                    created_at
                FROM public.users
                WHERE LOWER(username) = %s
                LIMIT 1
            """, (handle.lower(),))

            result = cur.fetchone()
            return dict(result) if result else None

    def find_by_fuzzy_name(self, telegram_name: str, limit=3) -> List[Dict]:
        """Find Twitter profiles with similar names using fuzzy matching"""
        if not telegram_name or len(telegram_name) < 3:
            return []

        with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
            # Use parameterized query with proper escaping for % operator
            cur.execute("""
                SELECT
                    id,
                    username,
                    name,
                    description,
                    location,
                    verified,
                    is_blue_verified,
                    followers_count,
                    following_count,
                    created_at,
                    similarity(name, %(name)s) AS name_score
                FROM public.users
                WHERE name %% %(name)s  -- %% for similarity operator (escaped in string)
                  AND similarity(name, %(name)s) > 0.65  -- Increased threshold from 0.5 to reduce noise
                ORDER BY name_score DESC
                LIMIT %(limit)s
            """, {'name': telegram_name, 'limit': limit})

            return [dict(row) for row in cur.fetchall()]

    def find_by_display_name_containment(self, telegram_name: str, limit=5) -> List[Dict]:
        """Find Twitter profiles where TG display name is contained in TW display name"""
        if not telegram_name or len(telegram_name) < 3:
            return []

        with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
            # Direct containment search (case-insensitive)
            cur.execute("""
                SELECT
                    id,
                    username,
                    name,
                    description,
                    location,
                    verified,
                    is_blue_verified,
                    followers_count,
                    following_count,
                    created_at
                FROM public.users
                WHERE name ILIKE %s  -- TG name contained in TW name
                  AND LENGTH(name) >= %s  -- TW name must be at least as long as TG name
                ORDER BY followers_count DESC  -- Prioritize by follower count
                LIMIT %s
            """, (f'%{telegram_name}%', len(telegram_name), limit))

            return [dict(row) for row in cur.fetchall()]

    def find_by_phash_match(self, telegram_user_id: int) -> List[Dict]:
        """Find Twitter profiles with matching profile picture phash (distance 0-1 only)"""
        if not self.telegram_conn:
            return []

        with self.telegram_conn.cursor(cursor_factory=DictCursor) as cur:
            # Query pre-computed phash matches (distance 0-1 for high confidence)
            cur.execute("""
                SELECT
                    m.twitter_user_id,
                    m.twitter_username,
                    m.hamming_distance,
                    m.telegram_phash,
                    m.twitter_phash
                FROM telegram_twitter_phash_matches m
                WHERE m.telegram_user_id = %s
                  AND m.hamming_distance <= 1  -- Only exact and distance-1 matches
                ORDER BY m.hamming_distance ASC
                LIMIT 5
            """, (telegram_user_id,))

            phash_matches = cur.fetchall()

        # Commit to close transaction
        self.telegram_conn.commit()

        if not phash_matches:
            return []

        # Fetch full Twitter profile data for matched users
        twitter_ids = [m['twitter_user_id'] for m in phash_matches]

        with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
            cur.execute("""
                SELECT
                    id,
                    username,
                    name,
                    description,
                    location,
                    verified,
                    is_blue_verified,
                    followers_count,
                    following_count,
                    created_at
                FROM public.users
                WHERE id = ANY(%s)
            """, (twitter_ids,))

            twitter_profiles = [dict(row) for row in cur.fetchall()]

        # Enrich with phash match details
        twitter_profile_map = {str(p['id']): p for p in twitter_profiles}
        results = []

        for match in phash_matches:
            tw_id = match['twitter_user_id']
            if tw_id in twitter_profile_map:
                profile = twitter_profile_map[tw_id].copy()
                profile['phash_distance'] = match['hamming_distance']
                profile['telegram_phash'] = match['telegram_phash']
                profile['twitter_phash'] = match['twitter_phash']
                results.append(profile)

        return results

    def find_by_telegram_in_twitter_bio(self, telegram_username: str, limit=3) -> List[Dict]:
        """Find Twitter profiles that mention this Telegram username in their bio (exact @mention only)"""
        if not telegram_username or len(telegram_username) < 4:
            return []

        with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
            # Search for various Telegram handle patterns in Twitter bios
            # Use regex with word boundaries to avoid substring matches in other handles
            cur.execute("""
                SELECT
                    id,
                    username,
                    name,
                    description,
                    location,
                    verified,
                    is_blue_verified,
                    followers_count,
                    following_count,
                    created_at
                FROM public.users
                WHERE description IS NOT NULL
                  AND (
                    description ~* %s  -- @username with word boundary (not part of longer handle)
                    OR LOWER(description) LIKE %s  -- t.me/username
                    OR LOWER(description) LIKE %s  -- telegram.me/username
                  )
                ORDER BY followers_count DESC
                LIMIT %s
            """, (
                r'@' + telegram_username + r'(\s|$|[^a-zA-Z0-9_])',  # Word boundary: space, end, or non-alphanumeric
                f'%t.me/{telegram_username.lower()}%',
                f'%telegram.me/{telegram_username.lower()}%',
                limit
            ))

            return [dict(row) for row in cur.fetchall()]

    def find_by_resolved_url(self, telegram_username: str) -> List[Dict]:
        """Find Twitter profiles whose bio URL resolves to this Telegram username"""
        if not telegram_username or len(telegram_username) < 4:
            return []

        with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
            # Query url_resolution_queue for Twitter profiles with resolved Telegram links
            cur.execute("""
                SELECT DISTINCT
                    u.id,
                    u.username,
                    u.name,
                    u.description,
                    u.location,
                    u.verified,
                    u.is_blue_verified,
                    u.followers_count,
                    u.following_count,
                    u.created_at,
                    urq.resolved_url,
                    urq.original_url
                FROM url_resolution_queue urq
                JOIN public.users u ON u.id::text = urq.twitter_id
                WHERE urq.telegram_handles IS NOT NULL
                  AND urq.telegram_handles @> %s::jsonb
                ORDER BY u.followers_count DESC
                LIMIT 3
            """, (f'["{telegram_username.lower()}"]',))

            results = cur.fetchall()

            # Enrich with URL details
            enriched = []
            for row in results:
                profile = dict(row)
                profile['resolved_telegram_url'] = profile.pop('resolved_url')
                profile['original_url'] = profile.pop('original_url')
                enriched.append(profile)

            return enriched

    def find_by_username_in_display_name(self, search_term: str, is_telegram: bool, limit: int = 5) -> List[Dict]:
        """
        Find Twitter profiles where display name contains username pattern

        If is_telegram=True: Search for TG username in Twitter display names
        If is_telegram=False: Search for Twitter username pattern in TG display names (reverse: find Twitter profiles whose username matches the TG display name)
        """
        with self.twitter_conn.cursor(cursor_factory=DictCursor) as cur:
            if is_telegram:
                # TG username in Twitter display name
                cur.execute("""
                    SELECT
                        id,
                        username,
                        name,
                        description,
                        location,
                        verified,
                        is_blue_verified,
                        followers_count,
                        following_count,
                        created_at
                    FROM public.users
                    WHERE name IS NOT NULL
                      AND LOWER(name) LIKE %s
                    ORDER BY followers_count DESC
                    LIMIT %s
                """, (f'%{search_term.lower()}%', limit))
            else:
                # Twitter username in TG display name - search for Twitter profiles whose username appears in the search term
                cur.execute("""
                    SELECT
                        id,
                        username,
                        name,
                        description,
                        location,
                        verified,
                        is_blue_verified,
                        followers_count,
                        following_count,
                        created_at
                    FROM public.users
                    WHERE username IS NOT NULL
                      AND LOWER(%s) LIKE '%%' || LOWER(username) || '%%'
                    ORDER BY followers_count DESC
                    LIMIT %s
                """, (search_term, limit))

            return [dict(row) for row in cur.fetchall()]

    def get_display_name(self, contact: Contact) -> str:
        """Get display name with fallback to first_name + last_name"""
        if contact.display_name:
            return contact.display_name
        # Fallback to first_name + last_name
        parts = [contact.first_name, contact.last_name]
        return ' '.join(p for p in parts if p).strip()

    def find_candidates_for_contact(self, contact: Contact) -> List[Dict]:
        """
        Find all Twitter candidates for a single Telegram contact

        Returns list of candidates with:
        {
            'twitter_profile': {...},
            'match_method': 'exact_bio_handle' | 'exact_username' | 'username_variation' | 'fuzzy_name',
            'baseline_confidence': 0.0-1.0,
            'match_details': {...}
        }
        """
        candidates = []
        seen_twitter_ids = set()

        # Get display name with fallback
        display_name = self.get_display_name(contact)

        # Method 1: Phash matching (profile picture similarity)
        phash_matches = self.find_by_phash_match(contact.user_id)
        for twitter_profile in phash_matches:
            phash_distance = twitter_profile.pop('phash_distance')
            telegram_phash = twitter_profile.pop('telegram_phash')
            twitter_phash = twitter_profile.pop('twitter_phash')

            # Baseline confidence based on phash distance
            if phash_distance == 0:
                baseline_confidence = 0.95  # Exact match - VERY strong signal
            elif phash_distance == 1:
                baseline_confidence = 0.88  # 1-bit difference - strong signal
            else:
                continue  # Skip distance > 1

            candidates.append({
                'twitter_profile': twitter_profile,
                'match_method': 'phash_match',
                'baseline_confidence': baseline_confidence,
                'match_details': {
                    'phash_distance': phash_distance,
                    'telegram_phash': telegram_phash,
                    'twitter_phash': twitter_phash
                }
            })
            seen_twitter_ids.add(twitter_profile['id'])

        # Method 2: Extract handles from bio
        if contact.bio:
            bio_handles = self.handle_extractor.extract_handles(contact.bio)
            for handle in bio_handles:
                twitter_profile = self.find_by_handle(handle)
                if twitter_profile and twitter_profile['id'] not in seen_twitter_ids:
                    candidates.append({
                        'twitter_profile': twitter_profile,
                        'match_method': 'exact_bio_handle',
                        'baseline_confidence': 0.95,
                        'match_details': {
                            'extracted_handle': handle,
                            'from_bio': True
                        }
                    })
                    seen_twitter_ids.add(twitter_profile['id'])

        # Method 3: Exact username match
        if contact.username:
            twitter_profile = self.find_by_handle(contact.username)
            if twitter_profile and twitter_profile['id'] not in seen_twitter_ids:
                candidates.append({
                    'twitter_profile': twitter_profile,
                    'match_method': 'exact_username',
                    'baseline_confidence': 0.90,
                    'match_details': {
                        'telegram_username': contact.username,
                        'twitter_username': twitter_profile['username']
                    }
                })
                seen_twitter_ids.add(twitter_profile['id'])

        # Method 4: Username variations
        if contact.username:
            variations = self.variation_generator.generate_variations(contact.username)
            for variation in variations:
                if variation == contact.username.lower():
                    continue  # Already checked in Method 2

                twitter_profile = self.find_by_handle(variation)
                if twitter_profile and twitter_profile['id'] not in seen_twitter_ids:
                    candidates.append({
                        'twitter_profile': twitter_profile,
                        'match_method': 'username_variation',
                        'baseline_confidence': 0.80,
                        'match_details': {
                            'telegram_username': contact.username,
                            'username_variation': variation,
                            'twitter_username': twitter_profile['username']
                        }
                    })
                    seen_twitter_ids.add(twitter_profile['id'])

        # Method 5: Twitter bio contains Telegram username (reverse lookup)
        if contact.username:
            reverse_matches = self.find_by_telegram_in_twitter_bio(contact.username, limit=3)
            for twitter_profile in reverse_matches:
                if twitter_profile['id'] not in seen_twitter_ids:
                    candidates.append({
                        'twitter_profile': twitter_profile,
                        'match_method': 'twitter_bio_has_telegram',
                        'baseline_confidence': 0.92,  # Very high confidence - they explicitly mention their Telegram
                        'match_details': {
                            'telegram_username': contact.username,
                            'twitter_username': twitter_profile['username'],
                            'found_in_twitter_bio': True
                        }
                    })
                    seen_twitter_ids.add(twitter_profile['id'])

        # Method 5b: Twitter bio URL resolves to Telegram username (via url_resolution_queue)
        if contact.username:
            url_resolved_matches = self.find_by_resolved_url(contact.username)
            for twitter_profile in url_resolved_matches:
                if twitter_profile['id'] not in seen_twitter_ids:
                    resolved_url = twitter_profile.pop('resolved_telegram_url', None)
                    original_url = twitter_profile.pop('original_url', None)

                    candidates.append({
                        'twitter_profile': twitter_profile,
                        'match_method': 'twitter_bio_url_resolves_to_telegram',
                        'baseline_confidence': 0.95,  # VERY high confidence - explicit URL link in bio
                        'match_details': {
                            'telegram_username': contact.username,
                            'twitter_username': twitter_profile['username'],
                            'resolved_url': resolved_url,
                            'original_url': original_url,
                            'found_via_url_resolution': True
                        }
                    })
                    seen_twitter_ids.add(twitter_profile['id'])

        # Method 6: Display name containment (TG name in TW name)
        if display_name:
            containment_matches = self.find_by_display_name_containment(display_name, limit=5)
            for twitter_profile in containment_matches:
                if twitter_profile['id'] not in seen_twitter_ids:
                    candidates.append({
                        'twitter_profile': twitter_profile,
                        'match_method': 'display_name_containment',
                        'baseline_confidence': 0.92,  # High confidence for exact name containment
                        'match_details': {
                            'telegram_name': display_name,
                            'twitter_name': twitter_profile['name'],
                            'match_type': 'tg_name_contained_in_tw_name'
                        }
                    })
                    seen_twitter_ids.add(twitter_profile['id'])

        # Method 7: Fuzzy name match (always run to find additional candidates)
        if display_name:
            fuzzy_matches = self.find_by_fuzzy_name(display_name, limit=5)
            for i, twitter_profile in enumerate(fuzzy_matches):
                name_score = twitter_profile.pop('name_score')

                # Calculate baseline confidence from name similarity
                baseline_confidence = min(0.85, name_score)  # Cap at 0.85 for fuzzy matches

                if twitter_profile['id'] not in seen_twitter_ids:
                    candidates.append({
                        'twitter_profile': twitter_profile,
                        'match_method': 'fuzzy_name',
                        'baseline_confidence': baseline_confidence,
                        'match_details': {
                            'telegram_name': display_name,
                            'twitter_name': twitter_profile['name'],
                            'fuzzy_score': name_score,
                            'candidate_rank': i + 1
                        }
                    })
                    seen_twitter_ids.add(twitter_profile['id'])

        # Method 8: TG username in Twitter display name
        if contact.username:
            matches = self.find_by_username_in_display_name(contact.username, is_telegram=True, limit=3)
            for twitter_profile in matches:
                if twitter_profile['id'] not in seen_twitter_ids:
                    candidates.append({
                        'twitter_profile': twitter_profile,
                        'match_method': 'tg_username_in_twitter_name',
                        'baseline_confidence': 0.88,
                        'match_details': {
                            'telegram_username': contact.username,
                            'twitter_name': twitter_profile['name'],
                            'found_in_display_name': True
                        }
                    })
                    seen_twitter_ids.add(twitter_profile['id'])

        # Method 9: Twitter username in TG display name
        if display_name:
            matches = self.find_by_username_in_display_name(display_name, is_telegram=False, limit=3)
            for twitter_profile in matches:
                if twitter_profile['id'] not in seen_twitter_ids:
                    candidates.append({
                        'twitter_profile': twitter_profile,
                        'match_method': 'twitter_username_in_tg_name',
                        'baseline_confidence': 0.86,
                        'match_details': {
                            'telegram_name': display_name,
                            'twitter_username': twitter_profile['username'],
                            'username_in_display_name': True
                        }
                    })
                    seen_twitter_ids.add(twitter_profile['id'])

        return candidates


def save_candidates_to_db(telegram_user_id: int, account_id: int, candidates: List[Dict], telegram_db):
    """Save candidates to twitter_match_candidates table"""
    if not candidates:
        return

    insert_data = []
    for cand in candidates:
        tw = cand['twitter_profile']
        insert_data.append((
            account_id,  # Add account_id
            telegram_user_id,
            tw['id'],
            tw['username'],
            tw['name'],
            tw.get('description', ''),
            tw.get('location'),
            tw.get('verified', False),
            tw.get('is_blue_verified', False),
            tw.get('followers_count', 0),
            cand.get('match_details', {}).get('candidate_rank', 1),
            cand['match_method'],
            cand['baseline_confidence'],
            json.dumps(cand['match_details'])  # Convert dict to JSON string
        ))

    execute_values(telegram_db, """
        INSERT INTO twitter_match_candidates (
            account_id,
            telegram_user_id,
            twitter_id,
            twitter_username,
            twitter_name,
            twitter_bio,
            twitter_location,
            twitter_verified,
            twitter_blue_verified,
            twitter_followers_count,
            candidate_rank,
            match_method,
            baseline_confidence,
            match_signals
        ) VALUES %s
        ON CONFLICT DO NOTHING
    """, insert_data, template="""(
        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
    )""")


def main():
    print()
    print("=" * 70)
    print("🔍 Twitter-Telegram Candidate Finder")
    print("=" * 70)
    print()

    # Check arguments
    test_mode = '--test' in sys.argv
    limit = 100 if test_mode else None

    # Check for --limit parameter
    if '--limit' in sys.argv:
        idx = sys.argv.index('--limit')
        limit = int(sys.argv[idx + 1])
        print(f"📊 LIMIT MODE: Processing first {limit:,} contacts")
        print()
    elif test_mode:
        print("🧪 TEST MODE: Processing first 100 contacts only")
        print()

    # Connect to databases
    print("📡 Connecting to databases...")

    # Connect SQLAlchemy to localhost (not RDS)
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker
    localhost_engine = create_engine('postgresql://andrewjiang@localhost:5432/telegram_contacts')
    LocalSession = sessionmaker(bind=localhost_engine)
    telegram_db = LocalSession()

    try:
        twitter_conn = psycopg2.connect(**TWITTER_DB_CONFIG)
        twitter_conn.autocommit = False
    except Exception as e:
        print(f"❌ Failed to connect to Twitter database: {e}")
        print(f"   Config: {TWITTER_DB_CONFIG}")
        return False

    # Also need psycopg2 connection to telegram DB for writing
    try:
        telegram_conn = psycopg2.connect(dbname='telegram_contacts', user='andrewjiang', host='localhost', port=5432)
        telegram_conn.autocommit = False
    except Exception as e:
        print(f"❌ Failed to connect to Telegram database: {e}")
        return False

    print("✅ Connected to both databases")
    print()

    try:
        # Initialize matcher (pass both connections for phash lookup)
        telegram_psycopg_conn = psycopg2.connect(
            dbname='telegram_contacts',
            user='andrewjiang',
            host='localhost'
        )
        matcher = TwitterMatcher(twitter_conn, telegram_psycopg_conn)

        # Get Telegram contacts with bios or usernames
        print("🔍 Loading Telegram contacts...")
        query = telegram_db.query(Contact).filter(
            Contact.user_id > 0,  # Exclude channels
            Contact.is_deleted == False,
            Contact.is_bot == False
        ).filter(
            (Contact.bio != None) | (Contact.username != None)
        ).order_by(Contact.user_id)

        if limit:
            query = query.limit(limit)

        contacts = query.all()
        print(f"✅ Found {len(contacts):,} contacts to process")
        print()

        # Process each contact
        stats = {
            'processed': 0,
            'with_candidates': 0,
            'total_candidates': 0,
            'by_method': {}
        }

        print("🚀 Finding Twitter candidates...")
        print()

        for i, contact in enumerate(contacts, 1):
            candidates = matcher.find_candidates_for_contact(contact)

            if candidates:
                stats['with_candidates'] += 1
                stats['total_candidates'] += len(candidates)

                # Track by method
                for cand in candidates:
                    method = cand['match_method']
                    stats['by_method'][method] = stats['by_method'].get(method, 0) + 1

                # Save to database
                with telegram_conn.cursor() as cur:
                    save_candidates_to_db(contact.user_id, contact.account_id, candidates, cur)
                    telegram_conn.commit()

            stats['processed'] += 1

            # Progress update every 10 contacts
            if i % 10 == 0:
                print(f"  Processed {i:,}/{len(contacts):,} contacts... (candidates: {stats['with_candidates']:,}, total: {stats['total_candidates']:,})")
            elif i == 1:
                # Show first contact immediately
                print(f"  Processed 1/{len(contacts):,} contacts... (candidates: {stats['with_candidates']:,}, total: {stats['total_candidates']:,})")

        # Final stats
        print()
        print("=" * 70)
        print("✅ CANDIDATE FINDING COMPLETE")
        print("=" * 70)
        print()
        print(f"📊 Statistics:")
        print(f"   Processed: {stats['processed']:,} contacts")
        print(f"   With candidates: {stats['with_candidates']:,} ({stats['with_candidates']/stats['processed']*100:.1f}%)")
        print(f"   Total candidates: {stats['total_candidates']:,}")
        print(f"   Avg candidates per match: {stats['total_candidates']/max(stats['with_candidates'], 1):.1f}")
        print()
        print(f"📈 By method:")
        for method, count in sorted(stats['by_method'].items(), key=lambda x: -x[1]):
            print(f"   {method}: {count:,}")
        print()

        return True

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False

    finally:
        telegram_db.close()
        twitter_conn.close()
        telegram_conn.close()


if __name__ == "__main__":
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n\n⚠️  Interrupted by user")
        sys.exit(1)