Initial commit: Twitter-Telegram Profile Matching System

This module provides comprehensive Twitter-to-Telegram profile matching
and verification using 10 different matching methods and LLM verification.

Features:
- 10 matching methods (phash, usernames, bio handles, URL resolution, fuzzy names)
- URL resolution integration for t.co → t.me links
- Async LLM verification with GPT-5-mini
- Interactive menu system with real-time stats
- Threaded candidate finding (~1.5 contacts/sec)
- Comprehensive documentation and guides

Key Components:
- find_twitter_candidates.py: Core matching logic (10 methods)
- find_twitter_candidates_threaded.py: Threaded implementation
- verify_twitter_matches_v2.py: LLM verification (V5 prompt)
- review_match_quality.py: Analysis and quality review
- main.py: Interactive menu system
- Complete documentation (README, CHANGELOG, QUICKSTART)

Performance:
- Candidate finding: ~16-18 hours for 43K contacts
- LLM verification: ~23 hours for 43K users
- Cost: ~$130 for full verification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Andrew Jiang
2025-11-04 22:56:25 -08:00
commit 5319d4d868
10 changed files with 3394 additions and 0 deletions

View File

@@ -0,0 +1,101 @@
-- Twitter-Telegram Matching Schema
-- Run this against your telegram_contacts database
-- Table: twitter_telegram_matches
-- Stores confirmed matches between Telegram and Twitter profiles
CREATE TABLE IF NOT EXISTS twitter_telegram_matches (
id SERIAL PRIMARY KEY,
-- Telegram side
telegram_user_id BIGINT NOT NULL REFERENCES contacts(user_id),
telegram_username VARCHAR(255),
telegram_name VARCHAR(255),
telegram_bio TEXT,
-- Twitter side
twitter_id VARCHAR(50) NOT NULL,
twitter_username VARCHAR(100) NOT NULL,
twitter_name VARCHAR(200),
twitter_bio TEXT,
twitter_location VARCHAR(200),
twitter_verified BOOLEAN,
twitter_blue_verified BOOLEAN,
twitter_followers_count INTEGER,
-- Matching metadata
match_method VARCHAR(50) NOT NULL, -- 'exact_bio_handle', 'exact_username', 'username_variation', 'fuzzy_name'
baseline_confidence FLOAT NOT NULL, -- Confidence before LLM (0-1)
llm_verdict VARCHAR(20) NOT NULL, -- 'YES', 'NO', 'UNSURE'
final_confidence FLOAT NOT NULL CHECK (final_confidence BETWEEN 0 AND 1),
-- Match details (JSON for flexibility)
match_details JSONB, -- {extracted_handles: [...], username_variation: 'xxx', fuzzy_score: 0.85}
-- LLM metadata
llm_tokens_used INTEGER,
llm_cost FLOAT,
-- Audit trail
matched_at TIMESTAMP DEFAULT NOW(),
needs_manual_review BOOLEAN DEFAULT FALSE,
verified_manually BOOLEAN DEFAULT FALSE,
manual_review_notes TEXT,
UNIQUE(telegram_user_id, twitter_id)
);
-- Indexes for performance
CREATE INDEX IF NOT EXISTS idx_ttm_telegram_user ON twitter_telegram_matches(telegram_user_id);
CREATE INDEX IF NOT EXISTS idx_ttm_twitter_id ON twitter_telegram_matches(twitter_id);
CREATE INDEX IF NOT EXISTS idx_ttm_twitter_username ON twitter_telegram_matches(LOWER(twitter_username));
CREATE INDEX IF NOT EXISTS idx_ttm_confidence ON twitter_telegram_matches(final_confidence DESC);
CREATE INDEX IF NOT EXISTS idx_ttm_verdict ON twitter_telegram_matches(llm_verdict);
CREATE INDEX IF NOT EXISTS idx_ttm_method ON twitter_telegram_matches(match_method);
CREATE INDEX IF NOT EXISTS idx_ttm_needs_review ON twitter_telegram_matches(needs_manual_review) WHERE needs_manual_review = TRUE;
-- Table: twitter_match_candidates (temporary staging)
-- Stores potential matches before LLM verification
CREATE TABLE IF NOT EXISTS twitter_match_candidates (
id SERIAL PRIMARY KEY,
telegram_user_id BIGINT NOT NULL REFERENCES contacts(user_id),
-- Twitter candidate info
twitter_id VARCHAR(50) NOT NULL,
twitter_username VARCHAR(100) NOT NULL,
twitter_name VARCHAR(200),
twitter_bio TEXT,
twitter_location VARCHAR(200),
twitter_verified BOOLEAN,
twitter_blue_verified BOOLEAN,
twitter_followers_count INTEGER,
-- Candidate scoring
candidate_rank INTEGER, -- 1 = best match, 2 = second best, etc.
match_method VARCHAR(50),
baseline_confidence FLOAT,
match_signals JSONB, -- {'handle_match': true, 'fuzzy_score': 0.85, ...}
-- LLM processing status
needs_llm_review BOOLEAN DEFAULT TRUE,
llm_processed BOOLEAN DEFAULT FALSE,
llm_verdict VARCHAR(20),
final_confidence FLOAT,
created_at TIMESTAMP DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_tmc_telegram_user ON twitter_match_candidates(telegram_user_id);
CREATE INDEX IF NOT EXISTS idx_tmc_twitter_id ON twitter_match_candidates(twitter_id);
CREATE INDEX IF NOT EXISTS idx_tmc_needs_review ON twitter_match_candidates(llm_processed, needs_llm_review)
WHERE needs_llm_review = TRUE AND llm_processed = FALSE;
CREATE INDEX IF NOT EXISTS idx_tmc_rank ON twitter_match_candidates(telegram_user_id, candidate_rank);
-- Grant permissions (adjust as needed for your setup)
-- GRANT ALL PRIVILEGES ON twitter_telegram_matches TO your_user;
-- GRANT ALL PRIVILEGES ON twitter_match_candidates TO your_user;
-- GRANT USAGE, SELECT ON SEQUENCE twitter_telegram_matches_id_seq TO your_user;
-- GRANT USAGE, SELECT ON SEQUENCE twitter_match_candidates_id_seq TO your_user;
COMMENT ON TABLE twitter_telegram_matches IS 'Confirmed matches between Telegram and Twitter profiles';
COMMENT ON TABLE twitter_match_candidates IS 'Temporary staging for potential matches awaiting LLM verification';