mirror of
https://github.com/lockin-bot/ProfileMatching.git
synced 2026-01-12 09:44:30 +08:00
This module provides comprehensive Twitter-to-Telegram profile matching and verification using 10 different matching methods and LLM verification.

Features:
- 10 matching methods (phash, usernames, bio handles, URL resolution, fuzzy names)
- URL resolution integration for t.co → t.me links
- Async LLM verification with GPT-5-mini
- Interactive menu system with real-time stats
- Threaded candidate finding (~1.5 contacts/sec)
- Comprehensive documentation and guides

Key Components:
- find_twitter_candidates.py: Core matching logic (10 methods)
- find_twitter_candidates_threaded.py: Threaded implementation
- verify_twitter_matches_v2.py: LLM verification (V5 prompt)
- review_match_quality.py: Analysis and quality review
- main.py: Interactive menu system
- Complete documentation (README, CHANGELOG, QUICKSTART)

Performance:
- Candidate finding: ~16-18 hours for 43K contacts
- LLM verification: ~23 hours for 43K users
- Cost: ~$130 for full verification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
226 lines
6.9 KiB
Python
Executable File
226 lines
6.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Twitter-Telegram Profile Matching System
|
|
Main menu for finding candidates and verifying matches with LLM
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import subprocess
|
|
import psycopg2
|
|
|
|
# Make this file's own directory and the sibling ../src directory importable.
# ../src is inserted last, so it ends up first on the search path.
_here = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, _here)
sys.path.insert(0, os.path.join(_here, '..', 'src'))
|
|
|
|
# Database configuration
# Keyword arguments passed to psycopg2.connect() for the contacts database.
# NOTE(review): the user name is hard-coded to a local account — confirm this
# is intended before running on another machine.
DB_CONFIG = dict(
    dbname='telegram_contacts',
    user='andrewjiang',
    host='localhost',
    port=5432,
)
|
|
|
|
# Connection settings for the Twitter data database (same server and user,
# different database name). Not referenced elsewhere in the visible code.
TWITTER_DB_CONFIG = dict(
    dbname='twitter_data',
    user='andrewjiang',
    host='localhost',
    port=5432,
)
|
|
|
|
|
|
def get_stats():
    """Get current matching statistics.

    Connects with ``DB_CONFIG`` and runs three aggregate queries over the
    ``twitter_match_candidates`` and ``twitter_telegram_matches`` tables.

    Returns:
        dict: Counters keyed by ``total_users``, ``total_candidates``,
        ``processed_candidates``, ``pending_candidates``, ``total_matches``,
        ``avg_confidence``, ``high_conf``, ``med_conf``, ``low_conf`` and
        ``users_with_matches``.  ``avg_confidence`` is ``0`` when the
        average is NULL (no matches yet).

    Raises:
        Any exception raised by psycopg2 while connecting or querying is
        propagated unchanged; the cursor and connection are closed first.
    """
    stats = {}

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor even if a query fails.
        with conn.cursor() as cur:
            # Candidates stats
            cur.execute("""
                SELECT
                    COUNT(DISTINCT telegram_user_id) as total_users,
                    COUNT(*) as total_candidates,
                    COUNT(*) FILTER (WHERE llm_processed = TRUE) as processed_candidates,
                    COUNT(*) FILTER (WHERE llm_processed = FALSE) as pending_candidates
                FROM twitter_match_candidates
            """)
            row = cur.fetchone()
            stats['total_users'] = row[0]
            stats['total_candidates'] = row[1]
            stats['processed_candidates'] = row[2]
            stats['pending_candidates'] = row[3]

            # Matches stats
            cur.execute("""
                SELECT
                    COUNT(*) as total_matches,
                    AVG(final_confidence) as avg_confidence,
                    COUNT(*) FILTER (WHERE final_confidence >= 0.90) as high_conf,
                    COUNT(*) FILTER (WHERE final_confidence >= 0.80 AND final_confidence < 0.90) as med_conf,
                    COUNT(*) FILTER (WHERE final_confidence >= 0.70 AND final_confidence < 0.80) as low_conf
                FROM twitter_telegram_matches
            """)
            row = cur.fetchone()
            stats['total_matches'] = row[0]
            # AVG() yields NULL on an empty table; report 0 so formatting works.
            stats['avg_confidence'] = row[1] or 0
            stats['high_conf'] = row[2]
            stats['med_conf'] = row[3]
            stats['low_conf'] = row[4]

            # Users with matches
            cur.execute("""
                SELECT COUNT(DISTINCT telegram_user_id)
                FROM twitter_telegram_matches
            """)
            stats['users_with_matches'] = cur.fetchone()[0]
    finally:
        # Always release the connection, including when a query raised.
        conn.close()

    return stats
|
|
|
|
|
|
def print_header():
    """Print the application banner: a blank line, an 80-column ``=`` rule,
    the title, a second rule and a trailing blank line."""
    rule = "=" * 80
    title = "🔗 Twitter-Telegram Profile Matching System"
    # One call, newline-separated: same bytes as five consecutive prints.
    print("", rule, title, rule, "", sep="\n")
|
|
|
|
|
|
def print_stats():
    """Print current statistics.

    Fetches the counters from ``get_stats()`` and prints them in two
    sections ("Candidates" and "Verified Matches") between 80-column
    ``-`` rules, followed by a blank line.
    """
    stats = get_stats()
    rule = "-" * 80

    # (label, stats key, format spec) for each bullet line, in display order.
    candidate_rows = (
        ("Users with candidates:", 'total_users', ','),
        ("Total candidates found:", 'total_candidates', ','),
        ("Processed by LLM:", 'processed_candidates', ','),
        ("Pending verification:", 'pending_candidates', ','),
    )
    match_rows = (
        ("Users with matches:", 'users_with_matches', ','),
        ("Total matches:", 'total_matches', ','),
        ("Average confidence:", 'avg_confidence', '.2f'),
        ("High confidence (90%+):", 'high_conf', ','),
        ("Medium confidence (80-89%):", 'med_conf', ','),
        ("Low confidence (70-79%):", 'low_conf', ','),
    )

    def emit(rows):
        # Print one bullet per row, formatting each value as it is printed.
        for label, key, spec in rows:
            print(f" • {label} {format(stats[key], spec)}")

    print("📊 Current Statistics:")
    print(rule)
    print("Candidates:")
    emit(candidate_rows)
    print()
    print("Verified Matches:")
    emit(match_rows)
    print(rule)
    print()
|
|
|
|
|
|
def run_script(script_name, *args):
    """Run a Python script with arguments and wait for it to finish.

    Args:
        script_name: Script file name without the ``.py`` suffix.  It is
            resolved relative to the directory containing this file.
        *args: Command-line arguments forwarded to the script as-is.

    Returns:
        int: The child process's exit status.
    """
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"{script_name}.py")
    # Launch the child with the interpreter running this menu rather than a
    # hard-coded binary name, so it works wherever this file itself runs.
    # sys.executable can be empty in unusual embeddings; fall back to PATH.
    interpreter = sys.executable or 'python3'
    cmd = [interpreter, script_path, *args]
    # Argument list with no shell: user-typed values are never shell-parsed.
    return subprocess.run(cmd).returncode
|
|
|
|
|
|
def main():
    """Run the interactive menu loop.

    Each iteration prints the header, the current statistics and the menu,
    then reads a choice:

    * ``0`` prints a goodbye message and returns.
    * ``1``-``5`` prompt for any options, launch the matching script via
      ``run_script`` and wait for Enter.
    * ``6`` goes straight back to the top, which re-prints the statistics.
    * anything else reports an invalid choice and waits for Enter.
    """
    menu_lines = (
        "📋 Main Menu:",
        "",
        "STEP 1: Find Candidates",
        " 1. Find Twitter candidates (threaded, RECOMMENDED)",
        " 2. Find Twitter candidates (single-threaded)",
        "",
        "STEP 2: Verify with LLM",
        " 3. Verify matches with LLM (async, RECOMMENDED)",
        " 4. Verify matches with LLM (test mode - 50 users)",
        "",
        "Analysis & Review",
        " 5. Review match quality",
        " 6. Show statistics only",
        "",
        " 0. Exit",
        "",
    )

    def announce(message):
        # Status line surrounded by blank lines.
        print()
        print(message)
        print()

    def ask(prompt, fallback=''):
        # Read a line, trim it, and substitute the fallback when it is empty.
        return input(prompt).strip() or fallback

    def find_threaded():
        announce("🔍 Finding Twitter candidates (threaded mode)...")
        limit = ask("👉 How many contacts? (press Enter for all): ")
        workers = ask("👉 Number of worker threads (default: 8): ", '8')
        # --limit is only passed when the user typed a value.
        limit_args = ('--limit', limit) if limit else ()
        run_script('find_twitter_candidates_threaded', *limit_args, '--workers', workers)

    def find_single():
        announce("🔍 Finding Twitter candidates (single-threaded mode)...")
        limit = ask("👉 How many contacts? (press Enter for all): ")
        limit_args = ('--limit', limit) if limit else ()
        run_script('find_twitter_candidates', *limit_args)

    def verify_async():
        announce("🤖 Verifying matches with LLM (async mode)...")
        concurrent = ask("👉 Concurrent requests (default: 100): ", '100')
        run_script('verify_twitter_matches_v2', '--verbose', '--concurrent', concurrent)

    def verify_test():
        announce("🧪 Test mode: Verifying 50 users with LLM...")
        run_script('verify_twitter_matches_v2', '--test', '--limit', '50', '--verbose', '--concurrent', '10')

    def review():
        announce("📊 Reviewing match quality...")
        run_script('review_match_quality')

    actions = {
        '1': find_threaded,
        '2': find_single,
        '3': verify_async,
        '4': verify_test,
        '5': review,
    }

    while True:
        print_header()
        print_stats()

        for line in menu_lines:
            print(line)

        choice = ask("👉 Enter your choice: ")

        if choice == '0':
            print("\n👋 Goodbye!\n")
            return

        if choice == '6':
            # Just show stats: the top of the loop prints them again.
            continue

        action = actions.get(choice)
        if action is None:
            print("\n❌ Invalid choice. Please try again.\n")
            input("Press Enter to continue...")
            continue

        action()
        input("\n✅ Press Enter to continue...")
|
|
|
|
|
|
if __name__ == "__main__":
    try:
        main()
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C, or stdin closed while a prompt was waiting (Ctrl-D or an
        # exhausted pipe): leave quietly instead of dumping a traceback.
        print("\n\n👋 Interrupted. Goodbye!\n")
        sys.exit(0)
|