Files
ProfileMatching/main.py
Andrew Jiang 5319d4d868 Initial commit: Twitter-Telegram Profile Matching System
This module provides comprehensive Twitter-to-Telegram profile matching
and verification using 10 different matching methods and LLM verification.

Features:
- 10 matching methods (phash, usernames, bio handles, URL resolution, fuzzy names)
- URL resolution integration for t.co → t.me links
- Async LLM verification with GPT-5-mini
- Interactive menu system with real-time stats
- Threaded candidate finding (~1.5 contacts/sec)
- Comprehensive documentation and guides

Key Components:
- find_twitter_candidates.py: Core matching logic (10 methods)
- find_twitter_candidates_threaded.py: Threaded implementation
- verify_twitter_matches_v2.py: LLM verification (V5 prompt)
- review_match_quality.py: Analysis and quality review
- main.py: Interactive menu system
- Complete documentation (README, CHANGELOG, QUICKSTART)

Performance:
- Candidate finding: ~16-18 hours for 43K contacts
- LLM verification: ~23 hours for 43K users
- Cost: ~$130 for full verification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-04 22:56:25 -08:00

226 lines
6.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Twitter-Telegram Profile Matching System
Main menu for finding candidates and verifying matches with LLM
"""
import sys
import os
import subprocess
import psycopg2
# Make this script's own directory and the sibling ../src directory importable
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'src'))
# Database configuration
# Keyword arguments for psycopg2.connect(); get_stats() unpacks DB_CONFIG.
DB_CONFIG = dict(
    dbname='telegram_contacts',
    user='andrewjiang',
    host='localhost',
    port=5432,
)

# Same server settings, pointing at the Twitter data database.
TWITTER_DB_CONFIG = dict(
    dbname='twitter_data',
    user='andrewjiang',
    host='localhost',
    port=5432,
)
def get_stats():
    """Get current matching statistics.

    Connects with ``DB_CONFIG`` and runs three aggregate queries against the
    ``twitter_match_candidates`` and ``twitter_telegram_matches`` tables.

    Returns:
        dict with the keys ``total_users``, ``total_candidates``,
        ``processed_candidates``, ``pending_candidates``, ``total_matches``,
        ``avg_confidence`` (0 when the average is NULL, e.g. no match rows),
        ``high_conf``, ``med_conf``, ``low_conf`` and ``users_with_matches``.

    Raises:
        Any exception raised by psycopg2 while connecting or querying is
        propagated; the cursor and connection are closed before it leaves
        this function.
    """
    stats = {}
    conn = psycopg2.connect(**DB_CONFIG)
    # try/finally rather than `with conn:` -- the psycopg2 connection context
    # manager only ends the transaction, it does not close the connection.
    try:
        cur = conn.cursor()
        try:
            # Candidates stats
            cur.execute("""
                SELECT
                    COUNT(DISTINCT telegram_user_id) as total_users,
                    COUNT(*) as total_candidates,
                    COUNT(*) FILTER (WHERE llm_processed = TRUE) as processed_candidates,
                    COUNT(*) FILTER (WHERE llm_processed = FALSE) as pending_candidates
                FROM twitter_match_candidates
            """)
            row = cur.fetchone()
            stats['total_users'] = row[0]
            stats['total_candidates'] = row[1]
            stats['processed_candidates'] = row[2]
            stats['pending_candidates'] = row[3]

            # Matches stats
            cur.execute("""
                SELECT
                    COUNT(*) as total_matches,
                    AVG(final_confidence) as avg_confidence,
                    COUNT(*) FILTER (WHERE final_confidence >= 0.90) as high_conf,
                    COUNT(*) FILTER (WHERE final_confidence >= 0.80 AND final_confidence < 0.90) as med_conf,
                    COUNT(*) FILTER (WHERE final_confidence >= 0.70 AND final_confidence < 0.80) as low_conf
                FROM twitter_telegram_matches
            """)
            row = cur.fetchone()
            stats['total_matches'] = row[0]
            # AVG() over zero rows is NULL -> None; report 0 so formatting works.
            stats['avg_confidence'] = row[1] or 0
            stats['high_conf'] = row[2]
            stats['med_conf'] = row[3]
            stats['low_conf'] = row[4]

            # Users with matches
            cur.execute("""
                SELECT COUNT(DISTINCT telegram_user_id)
                FROM twitter_telegram_matches
            """)
            stats['users_with_matches'] = cur.fetchone()[0]
        finally:
            cur.close()
    finally:
        conn.close()
    return stats
def print_header():
    """Print the banner: blank line, title framed by two 80-char rules, blank line."""
    rule = "=" * 80
    banner = ("", rule, "🔗 Twitter-Telegram Profile Matching System", rule, "")
    for line in banner:
        print(line)
def print_stats():
    """Fetch the statistics via get_stats() and print them as a two-section report."""
    stats = get_stats()
    rule = "-" * 80

    # (label, key in the stats dict, format spec) for each bullet line.
    candidate_rows = (
        ("Users with candidates", 'total_users', ','),
        ("Total candidates found", 'total_candidates', ','),
        ("Processed by LLM", 'processed_candidates', ','),
        ("Pending verification", 'pending_candidates', ','),
    )
    match_rows = (
        ("Users with matches", 'users_with_matches', ','),
        ("Total matches", 'total_matches', ','),
        ("Average confidence", 'avg_confidence', '.2f'),
        ("High confidence (90%+)", 'high_conf', ','),
        ("Medium confidence (80-89%)", 'med_conf', ','),
        ("Low confidence (70-79%)", 'low_conf', ','),
    )

    def emit(rows):
        # One bullet per row, printed as soon as it is formatted.
        for label, key, spec in rows:
            print(f" • {label}: {format(stats[key], spec)}")

    print("📊 Current Statistics:")
    print(rule)
    print("Candidates:")
    emit(candidate_rows)
    print()
    print("Verified Matches:")
    emit(match_rows)
    print(rule)
    print()
def run_script(script_name, *args):
    """Run a sibling Python script as a child process and wait for it to finish.

    Args:
        script_name: Script file name without the ``.py`` suffix; it is
            resolved relative to the directory containing this file.
        *args: Command-line arguments passed through to the script unchanged.

    Returns:
        int: The exit code of the child process.
    """
    import shutil  # local import: only needed to locate the interpreter

    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"{script_name}.py")
    # Keep using python3.10 when it is on PATH (previous behaviour); otherwise
    # fall back to the interpreter running this menu instead of failing with
    # FileNotFoundError.
    interpreter = shutil.which('python3.10') or sys.executable
    cmd = [interpreter, script_path, *args]
    return subprocess.run(cmd).returncode
def main():
    """Run the interactive menu loop until the user picks 0 (exit).

    Every pass prints the header and current statistics, shows the menu,
    reads one choice and acts on it:

    * 1 / 2 -- candidate finding (threaded / single-threaded), optionally
      limited to a number of contacts.
    * 3 / 4 -- LLM verification (full async run / fixed 50-user test run).
    * 5     -- match quality review.
    * 6     -- nothing extra; the loop restarts, which reprints the stats.
    * other -- error message, then wait for Enter.
    """
    menu_lines = (
        "📋 Main Menu:",
        "",
        "STEP 1: Find Candidates",
        " 1. Find Twitter candidates (threaded, RECOMMENDED)",
        " 2. Find Twitter candidates (single-threaded)",
        "",
        "STEP 2: Verify with LLM",
        " 3. Verify matches with LLM (async, RECOMMENDED)",
        " 4. Verify matches with LLM (test mode - 50 users)",
        "",
        "Analysis & Review",
        " 5. Review match quality",
        " 6. Show statistics only",
        "",
        " 0. Exit",
        "",
    )
    limit_prompt = "👉 How many contacts? (press Enter for all): "

    def announce(message):
        # Status line framed by one blank line above and below.
        print()
        print(message)
        print()

    while True:
        print_header()
        print_stats()
        for line in menu_lines:
            print(line)

        choice = input("👉 Enter your choice: ").strip()

        if choice == '0':
            print("\n👋 Goodbye!\n")
            return

        if choice == '6':
            # Just show stats, loop back to menu
            continue

        if choice == '1':
            # Find candidates (threaded)
            announce("🔍 Finding Twitter candidates (threaded mode)...")
            limit_input = input(limit_prompt).strip()
            workers = input("👉 Number of worker threads (default: 8): ").strip() or '8'
            limit_args = ['--limit', limit_input] if limit_input else []
            run_script('find_twitter_candidates_threaded', *limit_args, '--workers', workers)
        elif choice == '2':
            # Find candidates (single-threaded)
            announce("🔍 Finding Twitter candidates (single-threaded mode)...")
            limit_input = input(limit_prompt).strip()
            limit_args = ['--limit', limit_input] if limit_input else []
            run_script('find_twitter_candidates', *limit_args)
        elif choice == '3':
            # Verify with LLM (async)
            announce("🤖 Verifying matches with LLM (async mode)...")
            concurrent = input("👉 Concurrent requests (default: 100): ").strip() or '100'
            run_script('verify_twitter_matches_v2', '--verbose', '--concurrent', concurrent)
        elif choice == '4':
            # Verify with LLM (test mode)
            announce("🧪 Test mode: Verifying 50 users with LLM...")
            run_script('verify_twitter_matches_v2', '--test', '--limit', '50', '--verbose', '--concurrent', '10')
        elif choice == '5':
            # Review match quality
            announce("📊 Reviewing match quality...")
            run_script('review_match_quality')
        else:
            print("\n❌ Invalid choice. Please try again.\n")
            input("Press Enter to continue...")
            continue

        # Shared by every action that launched a script.
        input("\n✅ Press Enter to continue...")
if __name__ == "__main__":
    try:
        main()
    except (KeyboardInterrupt, EOFError):
        # KeyboardInterrupt: Ctrl-C. EOFError: input() hit end-of-file
        # (Ctrl-D or closed/piped stdin). Exit quietly in both cases
        # instead of dumping a traceback.
        print("\n\n👋 Interrupted. Goodbye!\n")
        sys.exit(0)