mirror of
https://github.com/lockin-bot/ProfileMatching.git
synced 2026-01-12 09:44:30 +08:00
This module provides comprehensive Twitter-to-Telegram profile matching and verification using 10 different matching methods and LLM verification. Features: - 10 matching methods (phash, usernames, bio handles, URL resolution, fuzzy names) - URL resolution integration for t.co → t.me links - Async LLM verification with GPT-5-mini - Interactive menu system with real-time stats - Threaded candidate finding (~1.5 contacts/sec) - Comprehensive documentation and guides Key Components: - find_twitter_candidates.py: Core matching logic (10 methods) - find_twitter_candidates_threaded.py: Threaded implementation - verify_twitter_matches_v2.py: LLM verification (V5 prompt) - review_match_quality.py: Analysis and quality review - main.py: Interactive menu system - Complete documentation (README, CHANGELOG, QUICKSTART) Performance: - Candidate finding: ~16-18 hours for 43K contacts - LLM verification: ~23 hours for 43K users - Cost: ~$130 for full verification 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
102 lines
4.2 KiB
SQL
102 lines
4.2 KiB
SQL
-- Twitter-Telegram Matching Schema
|
|
-- Run this against your telegram_contacts database
|
|
|
|
-- Table: twitter_telegram_matches
|
|
-- Stores confirmed matches between Telegram and Twitter profiles
|
|
CREATE TABLE IF NOT EXISTS twitter_telegram_matches (
|
|
id SERIAL PRIMARY KEY,
|
|
|
|
-- Telegram side
|
|
telegram_user_id BIGINT NOT NULL REFERENCES contacts(user_id),
|
|
telegram_username VARCHAR(255),
|
|
telegram_name VARCHAR(255),
|
|
telegram_bio TEXT,
|
|
|
|
-- Twitter side
|
|
twitter_id VARCHAR(50) NOT NULL,
|
|
twitter_username VARCHAR(100) NOT NULL,
|
|
twitter_name VARCHAR(200),
|
|
twitter_bio TEXT,
|
|
twitter_location VARCHAR(200),
|
|
twitter_verified BOOLEAN,
|
|
twitter_blue_verified BOOLEAN,
|
|
twitter_followers_count INTEGER,
|
|
|
|
-- Matching metadata
|
|
match_method VARCHAR(50) NOT NULL, -- 'exact_bio_handle', 'exact_username', 'username_variation', 'fuzzy_name'
|
|
baseline_confidence FLOAT NOT NULL, -- Confidence before LLM (0-1)
|
|
llm_verdict VARCHAR(20) NOT NULL, -- 'YES', 'NO', 'UNSURE'
|
|
final_confidence FLOAT NOT NULL CHECK (final_confidence BETWEEN 0 AND 1),
|
|
|
|
-- Match details (JSON for flexibility)
|
|
match_details JSONB, -- {extracted_handles: [...], username_variation: 'xxx', fuzzy_score: 0.85}
|
|
|
|
-- LLM metadata
|
|
llm_tokens_used INTEGER,
|
|
llm_cost FLOAT,
|
|
|
|
-- Audit trail
|
|
matched_at TIMESTAMP DEFAULT NOW(),
|
|
needs_manual_review BOOLEAN DEFAULT FALSE,
|
|
verified_manually BOOLEAN DEFAULT FALSE,
|
|
manual_review_notes TEXT,
|
|
|
|
UNIQUE(telegram_user_id, twitter_id)
|
|
);
|
|
|
|
-- Indexes for performance
|
|
CREATE INDEX IF NOT EXISTS idx_ttm_telegram_user ON twitter_telegram_matches(telegram_user_id);
|
|
CREATE INDEX IF NOT EXISTS idx_ttm_twitter_id ON twitter_telegram_matches(twitter_id);
|
|
CREATE INDEX IF NOT EXISTS idx_ttm_twitter_username ON twitter_telegram_matches(LOWER(twitter_username));
|
|
CREATE INDEX IF NOT EXISTS idx_ttm_confidence ON twitter_telegram_matches(final_confidence DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_ttm_verdict ON twitter_telegram_matches(llm_verdict);
|
|
CREATE INDEX IF NOT EXISTS idx_ttm_method ON twitter_telegram_matches(match_method);
|
|
CREATE INDEX IF NOT EXISTS idx_ttm_needs_review ON twitter_telegram_matches(needs_manual_review) WHERE needs_manual_review = TRUE;
|
|
|
|
-- Table: twitter_match_candidates (temporary staging)
|
|
-- Stores potential matches before LLM verification
|
|
CREATE TABLE IF NOT EXISTS twitter_match_candidates (
|
|
id SERIAL PRIMARY KEY,
|
|
|
|
telegram_user_id BIGINT NOT NULL REFERENCES contacts(user_id),
|
|
|
|
-- Twitter candidate info
|
|
twitter_id VARCHAR(50) NOT NULL,
|
|
twitter_username VARCHAR(100) NOT NULL,
|
|
twitter_name VARCHAR(200),
|
|
twitter_bio TEXT,
|
|
twitter_location VARCHAR(200),
|
|
twitter_verified BOOLEAN,
|
|
twitter_blue_verified BOOLEAN,
|
|
twitter_followers_count INTEGER,
|
|
|
|
-- Candidate scoring
|
|
candidate_rank INTEGER, -- 1 = best match, 2 = second best, etc.
|
|
match_method VARCHAR(50),
|
|
baseline_confidence FLOAT,
|
|
match_signals JSONB, -- {'handle_match': true, 'fuzzy_score': 0.85, ...}
|
|
|
|
-- LLM processing status
|
|
needs_llm_review BOOLEAN DEFAULT TRUE,
|
|
llm_processed BOOLEAN DEFAULT FALSE,
|
|
llm_verdict VARCHAR(20),
|
|
final_confidence FLOAT,
|
|
|
|
created_at TIMESTAMP DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_tmc_telegram_user ON twitter_match_candidates(telegram_user_id);
|
|
CREATE INDEX IF NOT EXISTS idx_tmc_twitter_id ON twitter_match_candidates(twitter_id);
|
|
CREATE INDEX IF NOT EXISTS idx_tmc_needs_review ON twitter_match_candidates(llm_processed, needs_llm_review)
|
|
WHERE needs_llm_review = TRUE AND llm_processed = FALSE;
|
|
CREATE INDEX IF NOT EXISTS idx_tmc_rank ON twitter_match_candidates(telegram_user_id, candidate_rank);
|
|
|
|
-- Grant permissions (adjust as needed for your setup)
|
|
-- GRANT ALL PRIVILEGES ON twitter_telegram_matches TO your_user;
|
|
-- GRANT ALL PRIVILEGES ON twitter_match_candidates TO your_user;
|
|
-- GRANT USAGE, SELECT ON SEQUENCE twitter_telegram_matches_id_seq TO your_user;
|
|
-- GRANT USAGE, SELECT ON SEQUENCE twitter_match_candidates_id_seq TO your_user;
|
|
|
|
COMMENT ON TABLE twitter_telegram_matches IS 'Confirmed matches between Telegram and Twitter profiles';
|
|
COMMENT ON TABLE twitter_match_candidates IS 'Temporary staging for potential matches awaiting LLM verification';
|