From ed47c1557915bb2472f6959e723cd76155312a98 Mon Sep 17 00:00:00 2001 From: Chris Xiong Date: Mon, 6 Apr 2020 00:50:58 +0800 Subject: Add deduper (unfinished tool for finding image duplicates). --- deduper/libpuzzle/php/examples/similar/Makefile.am | 6 + .../libpuzzle/php/examples/similar/config.inc.php | 9 + .../php/examples/similar/schema.pgsql.sql | 230 +++++++++++++++++++++ .../php/examples/similar/schema.sqlite3.sql | 23 +++ .../libpuzzle/php/examples/similar/similar.inc.php | 120 +++++++++++ deduper/libpuzzle/php/examples/similar/similar.php | 158 ++++++++++++++ 6 files changed, 546 insertions(+) create mode 100644 deduper/libpuzzle/php/examples/similar/Makefile.am create mode 100644 deduper/libpuzzle/php/examples/similar/config.inc.php create mode 100644 deduper/libpuzzle/php/examples/similar/schema.pgsql.sql create mode 100644 deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql create mode 100644 deduper/libpuzzle/php/examples/similar/similar.inc.php create mode 100644 deduper/libpuzzle/php/examples/similar/similar.php (limited to 'deduper/libpuzzle/php/examples/similar') diff --git a/deduper/libpuzzle/php/examples/similar/Makefile.am b/deduper/libpuzzle/php/examples/similar/Makefile.am new file mode 100644 index 0000000..126f6df --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/Makefile.am @@ -0,0 +1,6 @@ +EXTRA_DIST = \ + schema.sqlite3.sql \ + schema.pgsql.sql \ + similar.php \ + similar.inc.php \ + config.inc.php diff --git a/deduper/libpuzzle/php/examples/similar/config.inc.php b/deduper/libpuzzle/php/examples/similar/config.inc.php new file mode 100644 index 0000000..d4e3b41 --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/config.inc.php @@ -0,0 +1,9 @@ + diff --git a/deduper/libpuzzle/php/examples/similar/schema.pgsql.sql b/deduper/libpuzzle/php/examples/similar/schema.pgsql.sql new file mode 100644 index 0000000..7dc6bc1 --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/schema.pgsql.sql @@ -0,0 +1,230 @@ +-- +-- PostgreSQL database dump +-- + +SET client_encoding = 'UTF8'; +SET standard_conforming_strings = off; +SET check_function_bodies = false; +SET client_min_messages = warning; +SET escape_string_warning = off; + +SET SESSION AUTHORIZATION 'similar'; + +-- +-- Name: SCHEMA public; Type: COMMENT; Schema: -; Owner: similar +-- + +COMMENT ON SCHEMA public IS 'Standard public schema'; + + +SET search_path = public, pg_catalog; + +SET default_tablespace = ''; + +SET default_with_oids = false; + +-- +-- Name: pictures; Type: TABLE; Schema: public; Owner: similar; Tablespace: +-- + +CREATE TABLE pictures ( + id integer NOT NULL, + digest character(32) NOT NULL, + CONSTRAINT ck_digest CHECK ((char_length(digest) = 32)) +); + + +-- +-- Name: pictures_id_seq; Type: SEQUENCE; Schema: public; Owner: similar +-- + +CREATE SEQUENCE pictures_id_seq + START WITH 1 + INCREMENT BY 1 + NO MAXVALUE + NO MINVALUE + CACHE 1; + + +-- +-- Name: pictures_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: similar +-- + +ALTER SEQUENCE pictures_id_seq OWNED BY pictures.id; + + +-- +-- Name: sentpictures; Type: TABLE; Schema: public; Owner: similar; Tablespace: +-- + +CREATE TABLE sentpictures ( + id integer NOT NULL, + url character varying(255) NOT NULL, + sender character varying(100) NOT NULL, + picture_id integer NOT NULL, + CONSTRAINT ck_url CHECK (((url)::text <> ''::text)) +); + + +-- +-- Name: sentpictures_id_seq; Type: SEQUENCE; Schema: public; Owner: similar +-- + +CREATE SEQUENCE sentpictures_id_seq + START WITH 1 + INCREMENT BY 1 + NO MAXVALUE + NO MINVALUE + CACHE 1; + + +-- +-- Name: sentpictures_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: similar +-- + +ALTER SEQUENCE sentpictures_id_seq OWNED BY sentpictures.id; + + +-- +-- Name: signatures; Type: TABLE; Schema: public; Owner: similar; Tablespace: +-- + +CREATE TABLE signatures ( + id integer NOT NULL, + compressed_signature bytea NOT NULL, + picture_id integer NOT NULL, + CONSTRAINT ck_signature CHECK ((octet_length(compressed_signature) >= 182)) +); + + +-- +-- Name: signatures_id_seq; Type: SEQUENCE; Schema: public; Owner: similar +-- + +CREATE SEQUENCE signatures_id_seq + START WITH 1 + INCREMENT BY 1 + NO MAXVALUE + NO MINVALUE + CACHE 1; + + +-- +-- Name: signatures_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: similar +-- + +ALTER SEQUENCE signatures_id_seq OWNED BY signatures.id; + + +-- +-- Name: words; Type: TABLE; Schema: public; Owner: similar; Tablespace: +-- + +CREATE TABLE words ( + pos_and_word bytea NOT NULL, + signature_id integer NOT NULL, + CONSTRAINT ck_pos_and_word CHECK ((octet_length(pos_and_word) >= 2)) +); + + +-- +-- Name: id; Type: DEFAULT; Schema: public; Owner: similar +-- + +ALTER TABLE pictures ALTER COLUMN id SET DEFAULT nextval('pictures_id_seq'::regclass); + + +-- +-- Name: id; Type: DEFAULT; Schema: public; Owner: similar +-- + +ALTER TABLE sentpictures ALTER COLUMN id SET DEFAULT nextval('sentpictures_id_seq'::regclass); + + +-- +-- Name: id; Type: DEFAULT; Schema: public; Owner: similar +-- + +ALTER TABLE signatures ALTER COLUMN id SET DEFAULT nextval('signatures_id_seq'::regclass); + + +-- +-- Name: pictures_pkey; Type: CONSTRAINT; Schema: public; Owner: similar; Tablespace: +-- + +ALTER TABLE ONLY pictures + ADD CONSTRAINT pictures_pkey PRIMARY KEY (id); + + +-- +-- Name: sentpictures_pkey; Type: CONSTRAINT; Schema: public; Owner: similar; Tablespace: +-- + +ALTER TABLE ONLY sentpictures + ADD CONSTRAINT sentpictures_pkey PRIMARY KEY (id); + + +-- +-- Name: signatures_pkey; Type: CONSTRAINT; Schema: public; Owner: similar; Tablespace: +-- + +ALTER TABLE ONLY signatures + ADD CONSTRAINT signatures_pkey PRIMARY KEY (id); + + +-- +-- Name: idx_digest; Type: INDEX; Schema: public; Owner: similar; Tablespace: +-- + +CREATE UNIQUE INDEX idx_digest ON pictures USING btree (digest); + + +-- +-- Name: idx_picture_id; Type: INDEX; Schema: public; Owner: similar; Tablespace: +-- + +CREATE INDEX idx_picture_id ON sentpictures USING btree (picture_id); + + +-- +-- Name: idx_pos_and_word; Type: INDEX; Schema: public; Owner: similar; Tablespace: +-- + +CREATE INDEX idx_pos_and_word ON words USING btree (pos_and_word); + + +-- +-- Name: idx_url; Type: INDEX; Schema: public; Owner: similar; Tablespace: +-- + +CREATE UNIQUE INDEX idx_url ON sentpictures USING btree (url); + + +-- +-- Name: sentpictures_picture_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: similar +-- + +ALTER TABLE ONLY sentpictures + ADD CONSTRAINT sentpictures_picture_id_fkey FOREIGN KEY (picture_id) REFERENCES pictures(id) ON UPDATE CASCADE ON DELETE CASCADE; + + +-- +-- Name: signatures_picture_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: similar +-- + +ALTER TABLE ONLY signatures + ADD CONSTRAINT signatures_picture_id_fkey FOREIGN KEY (picture_id) REFERENCES pictures(id) ON UPDATE CASCADE ON DELETE CASCADE; + + +-- +-- Name: words_signature_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: similar +-- + +ALTER TABLE ONLY words + ADD CONSTRAINT words_signature_id_fkey FOREIGN KEY (signature_id) REFERENCES signatures(id) ON UPDATE CASCADE ON DELETE CASCADE; + + +-- +-- PostgreSQL database dump complete +-- + diff --git a/deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql b/deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql new file mode 100644 index 0000000..dc5a6c3 --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql @@ -0,0 +1,23 @@ +CREATE TABLE pictures ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + digest CHAR(32) NOT NULL +); +CREATE TABLE sentpictures ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url VARCHAR(255) NOT NULL, + sender VARCHAR(100) NOT NULL, + picture_id INTEGER NOT NULL +); +CREATE TABLE signatures ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + compressed_signature CHAR(182) NOT NULL, + picture_id INTEGER NOT NULL +); +CREATE TABLE words ( + pos_and_word CHAR(5) NOT NULL, + signature_id INTEGER NOT NULL +); +CREATE UNIQUE INDEX idx_digest ON pictures(digest); +CREATE INDEX idx_picture_id ON sentpictures (picture_id); +CREATE INDEX idx_pos_and_word ON words(pos_and_word); +CREATE UNIQUE INDEX idx_url ON sentpictures (url); diff --git a/deduper/libpuzzle/php/examples/similar/similar.inc.php b/deduper/libpuzzle/php/examples/similar/similar.inc.php new file mode 100644 index 0000000..cfc806e --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/similar.inc.php @@ -0,0 +1,120 @@ +beginTransaction(); + try { + $st = $dbh->prepare + ('DELETE FROM sentpictures WHERE url = :url'); + $st->execute(array(':url' => $url)); + $st = $dbh->prepare + ('SELECT id FROM pictures WHERE digest = :digest'); + $st->execute(array(':digest' => $md5)); + $picture_id = $st->fetchColumn(); + $st->closeCursor(); + $duplicate = TRUE; + if ($picture_id === FALSE) { + $duplicate = FALSE; + $st = $dbh->prepare + ('INSERT INTO pictures (digest) VALUES (:digest)'); + $st->execute(array(':digest' => $md5)); + $picture_id = $dbh->lastInsertId('id'); + } + $st = $dbh->prepare + ('INSERT INTO sentpictures (url, sender, picture_id) ' . + 'VALUES (:url, :sender, :picture_id)'); + $st->execute(array(':url' => $url, ':sender' => $client_info, + ':picture_id' => $picture_id)); + if ($duplicate === TRUE) { + $dbh->commit(); + return TRUE; + } + $st = $dbh->prepare + ('INSERT INTO signatures (compressed_signature, picture_id) ' . + 'VALUES(:compressed_signature, :picture_id)'); + $st->execute(array(':compressed_signature' => $compressed_cvec, + ':picture_id' => $picture_id)); + $signature_id = $dbh->lastInsertId('id'); + $st = $dbh->prepare + ('INSERT INTO words (pos_and_word, signature_id) ' . + 'VALUES (:pos_and_word, :signature_id)'); + foreach ($words as $u => $word) { + $st->execute(array('pos_and_word' + => chr($u) . puzzle_compress_cvec($word), + 'signature_id' => $signature_id)); + } + $dbh->commit(); + } catch (Exception $e) { + var_dump($e); + $dbh->rollback(); + } + return TRUE; +} + +function find_similar_pictures($md5, $cvec, + $threshold = PUZZLE_CVEC_SIMILARITY_THRESHOLD) { + $compressed_cvec = puzzle_compress_cvec($cvec); + $words = split_into_words($cvec); + $dbh = new PDO(DB_DSN); + $dbh->beginTransaction(); + $sql = 'SELECT DISTINCT(signature_id) AS signature_id FROM words ' . + 'WHERE pos_and_word IN ('; + $coma = FALSE; + foreach ($words as $u => $word) { + if ($coma === TRUE) { + $sql .= ','; + } + $sql .= $dbh->quote(chr($u) . puzzle_compress_cvec($word)); + $coma = TRUE; + } + $sql .= ')'; + $res_words = $dbh->query($sql); + $scores = array(); + $st = $dbh->prepare('SELECT compressed_signature, picture_id ' . + 'FROM signatures WHERE id = :id'); + while (($signature_id = $res_words->fetchColumn()) !== FALSE) { + $st->execute(array(':id' => $signature_id)); + $row = $st->fetch(); + $found_compressed_signature = $row['compressed_signature']; + $picture_id = $row['picture_id']; + $found_cvec = puzzle_uncompress_cvec($found_compressed_signature); + $distance = puzzle_vector_normalized_distance($cvec, $found_cvec); + if ($distance < $threshold && $distance > 0.0) { + $scores[$picture_id] = $distance; + } + } + $sql = 'SELECT url FROM sentpictures WHERE picture_id IN ('; + $coma = FALSE; + foreach ($scores as $picture_id => $score) { + if ($coma === TRUE) { + $sql .= ','; + } + $sql .= $dbh->quote($picture_id); + $coma = TRUE; + } + $sql .= ')'; + $urls = array(); + if (!empty($scores)) { + $res_urls = $dbh->query($sql); + while (($url = $res_urls->fetchColumn()) !== FALSE) { + array_push($urls, $url); + } + } + $dbh->commit(); + + return $urls; +} + +?> diff --git a/deduper/libpuzzle/php/examples/similar/similar.php b/deduper/libpuzzle/php/examples/similar/similar.php new file mode 100644 index 0000000..4b3ad40 --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/similar.php @@ -0,0 +1,158 @@ + + + + +

Similar images finder using libpuzzle

+' . "\n"; + echo 'Enter an image URL (http only):' . "\n"; + echo '' . "\n"; + echo ''; + echo '' . "\n"; +} + +function display_error($err) { + echo '
' . htmlspecialchars($err) . '
' . "\n"; +} + +function display_loading() { + echo '
Loading...
' . "\n"; + @ob_flush(); flush(); +} + +function display_loaded() { + echo '
Loaded.
' . "\n"; + @ob_flush(); flush(); +} + +function display_signature_ok() { + echo '
Signature computed.
' . "\n"; + @ob_flush(); flush(); +} + +function remove_tmpfile($file) { + @unlink($file); +} + +function get_client_info() { + return @$_SERVER['REMOTE_ADDR'] . '/' . time(); +} + +function display_similar_pictures($urls) { + echo '
' . "\n"; + foreach ($urls as $url) { + echo ''; + echo ' '; + echo '' . "\n"; + + } + echo '
' . "\n"; +} + +function record_url($url, &$md5, &$cvec) { + if (function_exists('sys_get_temp_dir')) { + $tmpdir = sys_get_temp_dir(); + } else { + $tmpdir = '/tmp'; + } + $dfn = tempnam($tmpdir, 'similar-' . md5(uniqid(mt_rand(), TRUE))); + register_shutdown_function('remove_tmpfile', $dfn); + if (($dfp = fopen($dfn, 'w')) == FALSE) { + display_form(); + display_error('Unable to create the temporary file'); + return FALSE; + } + if (($fp = fopen($url, 'r')) == FALSE) { + display_form(); + display_error('Unable to open: [' . $url . ']'); + return FALSE; + } + $f = fread($fp, 4096); + $written = strlen($f); + if (empty($f)) { + display_form(); + display_error('Unable to load: [' . $url . ']'); + return FALSE; + } + fwrite($dfp, $f); + $infos = @getimagesize($dfn); + if (empty($infos) || + ($infos[2] !== IMAGETYPE_GIF && $infos[2] !== IMAGETYPE_JPEG && + $infos[2] !== IMAGETYPE_PNG) || + $infos[0] < 50 || $infos[1] < 50) { + fclose($dfp); + display_form(); + display_error('Unsupported image format'); + return FALSE; + } + fseek($dfp, strlen($f)); + while (!feof($fp)) { + $max = MAX_IMAGE_SIZE - $written; + if ($max > 65536) { + $max = 65536; + } + $t = fread($fp, $max); + fwrite($dfp, $t); + $written += strlen($t); + if ($written > MAX_IMAGE_SIZE) { + fclose($dfp); + display_form(); + display_error('File too large'); + return FALSE; + } + } + unset($t); + fclose($dfp); + display_loaded(); + $md5 = @md5_file($dfn); + if (empty($md5)) { + display_form(); + display_error('Unable to get the MD5 of the file'); + return FALSE; + } + $cvec = puzzle_fill_cvec_from_file($dfn); + if (empty($cvec)) { + display_form(); + display_error('Unable to compute image signature'); + return FALSE; + } + display_signature_ok(); + save_signature($url, get_client_info(), $md5, $cvec); + + return TRUE; +} + +$url = trim(@$_POST['url']); +if (empty($url)) { + display_form(); + exit(0); +} +if (strlen($url) > MAX_URL_SIZE || + preg_match('£^http://([a-z0-9-]+[.])+[a-z]{2,}/.£i', $url) <= 0) { + display_form(); + display_error('Invalid URL, must be http://...'); + exit(1); +} +display_loading(); +$md5 = FALSE; +$cvec = FALSE; +if (record_url($url, $md5, $cvec) !== TRUE) { + exit(1); +} +$urls = find_similar_pictures($md5, $cvec); +unset($cvec); +display_form(); +display_similar_pictures($urls); + +?> + + -- cgit v1.2.3