From ed47c1557915bb2472f6959e723cd76155312a98 Mon Sep 17 00:00:00 2001 From: Chris Xiong Date: Mon, 6 Apr 2020 00:50:58 +0800 Subject: Add deduper (unfinished tool for finding image duplicates). --- .gitignore | 1 + deduper/CMakeLists.txt | 20 + deduper/deduper.cpp | 195 ++++++ deduper/libpuzzle/AUTHORS | 1 + deduper/libpuzzle/COPYING | 17 + deduper/libpuzzle/ChangeLog | 0 deduper/libpuzzle/Makefile.am | 11 + deduper/libpuzzle/NEWS | 0 deduper/libpuzzle/README | 202 +++++++ deduper/libpuzzle/README-PHP | 76 +++ deduper/libpuzzle/THANKS | 6 + deduper/libpuzzle/autogen.sh | 17 + deduper/libpuzzle/composer.json | 10 + deduper/libpuzzle/configure.ac | 70 +++ deduper/libpuzzle/man/Makefile.am | 7 + deduper/libpuzzle/man/libpuzzle.3 | 296 +++++++++ deduper/libpuzzle/man/puzzle-diff.8 | 58 ++ deduper/libpuzzle/man/puzzle_set.3 | 129 ++++ deduper/libpuzzle/php/Makefile.am | 3 + deduper/libpuzzle/php/examples/Makefile.am | 2 + deduper/libpuzzle/php/examples/similar/Makefile.am | 6 + .../libpuzzle/php/examples/similar/config.inc.php | 9 + .../php/examples/similar/schema.pgsql.sql | 230 +++++++ .../php/examples/similar/schema.sqlite3.sql | 23 + .../libpuzzle/php/examples/similar/similar.inc.php | 120 ++++ deduper/libpuzzle/php/examples/similar/similar.php | 158 +++++ deduper/libpuzzle/php/libpuzzle/CREDITS | 1 + deduper/libpuzzle/php/libpuzzle/EXPERIMENTAL | 0 deduper/libpuzzle/php/libpuzzle/LICENSE | 15 + deduper/libpuzzle/php/libpuzzle/Makefile.am | 15 + deduper/libpuzzle/php/libpuzzle/README | 4 + deduper/libpuzzle/php/libpuzzle/build/Makefile.am | 0 deduper/libpuzzle/php/libpuzzle/config.m4 | 49 ++ .../libpuzzle/php/libpuzzle/include/Makefile.am | 0 deduper/libpuzzle/php/libpuzzle/libpuzzle.c | 410 +++++++++++++ deduper/libpuzzle/php/libpuzzle/libpuzzle.php | 21 + .../libpuzzle/php/libpuzzle/modules/Makefile.am | 0 deduper/libpuzzle/php/libpuzzle/php_libpuzzle.h | 66 ++ deduper/libpuzzle/php/libpuzzle/tests/001.phpt | 10 + deduper/libpuzzle/php/libpuzzle/tests/002.phpt | 
15 + deduper/libpuzzle/php/libpuzzle/tests/003.phpt | 24 + deduper/libpuzzle/php/libpuzzle/tests/Makefile.am | 7 + .../libpuzzle/php/libpuzzle/tests/pics/Makefile.am | 3 + .../libpuzzle/php/libpuzzle/tests/pics/pic-a-0.jpg | Bin 0 -> 13946 bytes .../libpuzzle/php/libpuzzle/tests/pics/pic-a-1.jpg | Bin 0 -> 27407 bytes deduper/libpuzzle/src/CMakeLists.txt | 21 + deduper/libpuzzle/src/Makefile.am | 72 +++ deduper/libpuzzle/src/compress.c | 125 ++++ deduper/libpuzzle/src/cvec.c | 202 +++++++ deduper/libpuzzle/src/dvec.c | 663 +++++++++++++++++++++ deduper/libpuzzle/src/globals.h | 26 + deduper/libpuzzle/src/pics/Makefile.am | 8 + deduper/libpuzzle/src/pics/duck.gif | Bin 0 -> 7196 bytes deduper/libpuzzle/src/pics/luxmarket_tshirt01.jpg | Bin 0 -> 41128 bytes .../src/pics/luxmarket_tshirt01_black.jpg | Bin 0 -> 19800 bytes .../libpuzzle/src/pics/luxmarket_tshirt01_sal.jpg | Bin 0 -> 24646 bytes .../src/pics/luxmarket_tshirt01_sheum.jpg | Bin 0 -> 16128 bytes deduper/libpuzzle/src/pics/pic-a-0.jpg | Bin 0 -> 13946 bytes deduper/libpuzzle/src/pics/pic-a-1.jpg | Bin 0 -> 27407 bytes deduper/libpuzzle/src/puzzle-diff.c | 130 ++++ deduper/libpuzzle/src/puzzle.c | 22 + deduper/libpuzzle/src/puzzle.h | 122 ++++ deduper/libpuzzle/src/puzzle_common.h | 18 + deduper/libpuzzle/src/puzzle_p.h | 67 +++ deduper/libpuzzle/src/regress_1.c | 32 + deduper/libpuzzle/src/regress_2.c | 72 +++ deduper/libpuzzle/src/regress_3.c | 35 ++ deduper/libpuzzle/src/tunables.c | 84 +++ deduper/libpuzzle/src/vector_ops.c | 95 +++ deduper/thread_pool.h | 127 ++++ 70 files changed, 4228 insertions(+) create mode 100644 .gitignore create mode 100644 deduper/CMakeLists.txt create mode 100644 deduper/deduper.cpp create mode 100644 deduper/libpuzzle/AUTHORS create mode 100644 deduper/libpuzzle/COPYING create mode 100644 deduper/libpuzzle/ChangeLog create mode 100644 deduper/libpuzzle/Makefile.am create mode 100644 deduper/libpuzzle/NEWS create mode 100644 deduper/libpuzzle/README create mode 100644 
deduper/libpuzzle/README-PHP create mode 100644 deduper/libpuzzle/THANKS create mode 100755 deduper/libpuzzle/autogen.sh create mode 100644 deduper/libpuzzle/composer.json create mode 100644 deduper/libpuzzle/configure.ac create mode 100644 deduper/libpuzzle/man/Makefile.am create mode 100644 deduper/libpuzzle/man/libpuzzle.3 create mode 100644 deduper/libpuzzle/man/puzzle-diff.8 create mode 100644 deduper/libpuzzle/man/puzzle_set.3 create mode 100644 deduper/libpuzzle/php/Makefile.am create mode 100644 deduper/libpuzzle/php/examples/Makefile.am create mode 100644 deduper/libpuzzle/php/examples/similar/Makefile.am create mode 100644 deduper/libpuzzle/php/examples/similar/config.inc.php create mode 100644 deduper/libpuzzle/php/examples/similar/schema.pgsql.sql create mode 100644 deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql create mode 100644 deduper/libpuzzle/php/examples/similar/similar.inc.php create mode 100644 deduper/libpuzzle/php/examples/similar/similar.php create mode 100644 deduper/libpuzzle/php/libpuzzle/CREDITS create mode 100644 deduper/libpuzzle/php/libpuzzle/EXPERIMENTAL create mode 100644 deduper/libpuzzle/php/libpuzzle/LICENSE create mode 100644 deduper/libpuzzle/php/libpuzzle/Makefile.am create mode 100644 deduper/libpuzzle/php/libpuzzle/README create mode 100644 deduper/libpuzzle/php/libpuzzle/build/Makefile.am create mode 100644 deduper/libpuzzle/php/libpuzzle/config.m4 create mode 100644 deduper/libpuzzle/php/libpuzzle/include/Makefile.am create mode 100644 deduper/libpuzzle/php/libpuzzle/libpuzzle.c create mode 100644 deduper/libpuzzle/php/libpuzzle/libpuzzle.php create mode 100644 deduper/libpuzzle/php/libpuzzle/modules/Makefile.am create mode 100644 deduper/libpuzzle/php/libpuzzle/php_libpuzzle.h create mode 100644 deduper/libpuzzle/php/libpuzzle/tests/001.phpt create mode 100644 deduper/libpuzzle/php/libpuzzle/tests/002.phpt create mode 100644 deduper/libpuzzle/php/libpuzzle/tests/003.phpt create mode 100644 
deduper/libpuzzle/php/libpuzzle/tests/Makefile.am create mode 100644 deduper/libpuzzle/php/libpuzzle/tests/pics/Makefile.am create mode 100644 deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-0.jpg create mode 100644 deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-1.jpg create mode 100644 deduper/libpuzzle/src/CMakeLists.txt create mode 100644 deduper/libpuzzle/src/Makefile.am create mode 100644 deduper/libpuzzle/src/compress.c create mode 100644 deduper/libpuzzle/src/cvec.c create mode 100644 deduper/libpuzzle/src/dvec.c create mode 100644 deduper/libpuzzle/src/globals.h create mode 100644 deduper/libpuzzle/src/pics/Makefile.am create mode 100644 deduper/libpuzzle/src/pics/duck.gif create mode 100644 deduper/libpuzzle/src/pics/luxmarket_tshirt01.jpg create mode 100644 deduper/libpuzzle/src/pics/luxmarket_tshirt01_black.jpg create mode 100644 deduper/libpuzzle/src/pics/luxmarket_tshirt01_sal.jpg create mode 100644 deduper/libpuzzle/src/pics/luxmarket_tshirt01_sheum.jpg create mode 100644 deduper/libpuzzle/src/pics/pic-a-0.jpg create mode 100644 deduper/libpuzzle/src/pics/pic-a-1.jpg create mode 100644 deduper/libpuzzle/src/puzzle-diff.c create mode 100644 deduper/libpuzzle/src/puzzle.c create mode 100644 deduper/libpuzzle/src/puzzle.h create mode 100644 deduper/libpuzzle/src/puzzle_common.h create mode 100644 deduper/libpuzzle/src/puzzle_p.h create mode 100644 deduper/libpuzzle/src/regress_1.c create mode 100644 deduper/libpuzzle/src/regress_2.c create mode 100644 deduper/libpuzzle/src/regress_3.c create mode 100644 deduper/libpuzzle/src/tunables.c create mode 100644 deduper/libpuzzle/src/vector_ops.c create mode 100644 deduper/thread_pool.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9a150e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +deduper/build diff --git a/deduper/CMakeLists.txt b/deduper/CMakeLists.txt new file mode 100644 index 0000000..ac0859d --- /dev/null +++ b/deduper/CMakeLists.txt @@ -0,0 +1,20 @@ 
+cmake_minimum_required(VERSION 3.11.0) +project(deduper C CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +include(FindPkgConfig) +pkg_search_module(gdlib REQUIRED gdlib) +find_package(Threads REQUIRED) + +add_subdirectory(libpuzzle/src) +add_executable(deduper deduper.cpp thread_pool.h) +target_link_directories(deduper + PRIVATE + ${gdlib_LIBRARY_DIRS} +) +target_link_libraries(deduper + puzzle + ${gdlib_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} +) diff --git a/deduper/deduper.cpp b/deduper/deduper.cpp new file mode 100644 index 0000000..8f6e2f4 --- /dev/null +++ b/deduper/deduper.cpp @@ -0,0 +1,195 @@ +#include "libpuzzle/src/puzzle.h" + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "thread_pool.h" + +PuzzleContext pzctx; +int ctr; +int recursive; +int njobs=1; +double threshold=0.3; +std::vector paths; + +int parse_arguments(int argc,char **argv) +{ + recursive=0; + int help=0; + option longopt[]= + { + {"recursive",no_argument ,&recursive,1}, +// {"destdir" ,required_argument,0 ,'D'}, + {"jobs" ,required_argument,0 ,'j'}, + {"threshold",required_argument,0 ,'d'}, + {"help" ,no_argument ,&help ,1}, + {0 ,0 ,0 ,0} + }; + while(1) + { + int idx=0; + int c=getopt_long(argc,argv,"rhj:d:",longopt,&idx); + if(!~c)break; + switch(c) + { + case 0: + if(longopt[idx].flag)break; + if(std::string("jobs")==longopt[idx].name) + sscanf(optarg,"%d",&njobs); + if(std::string("threshold")==longopt[idx].name) + sscanf(optarg,"%lf",&threshold); + break; + case 'r': + recursive=1; + break; + case 'h': + help=1; + break; + case 'j': + sscanf(optarg,"%d",&njobs); + break; + case 'd': + sscanf(optarg,"%lf",&threshold); + break; + } + } + for(;optind1||threshold<0) + { + puts("Invalid threshold value."); + return 2; + } + if(threshold<1e-6)threshold=1e-6; + if(!paths.size()) + { + puts("Missing image path."); + return 2; + } + return 0; +} + +void build_file_list(std::filesystem::path path,bool 
recursive,std::vector&out) +{ + if(recursive) + { + auto dirit=std::filesystem::recursive_directory_iterator(path); + for(auto &p:dirit) + { + FILE* fp=fopen(p.path().c_str(),"r"); + char c[8]; + fread((void*)c,1,6,fp); + if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)||!memcmp(c,"GIF87a",6)||!memcmp(c,"GIF89a",6)) + out.push_back(p.path().string()); + fclose(fp); + } + } + else + { + auto dirit=std::filesystem::directory_iterator(path); + for(auto &p:dirit) + { + FILE* fp=fopen(p.path().c_str(),"r"); + char c[8]; + fread((void*)c,1,6,fp); + if(!memcmp(c,"\x89PNG\r\n",6)||!memcmp(c,"\xff\xd8\xff",3)||!memcmp(c,"GIF87a",6)||!memcmp(c,"GIF89a",6)) + out.push_back(p.path().string()); + fclose(fp); + } + } +} + +void compute_signature_vectors(const std::vector&files,std::vector&output) +{ + thread_pool tp(njobs); + for(size_t i=0;i&vec,std::vector>&out) +{ + thread_pool tp(njobs); + for(size_t i=0;ifile#%lu\n",thid,ida,idb); + if(vec[ida].sizeof_vec&&vec[idb].sizeof_vec) + { + double d=puzzle_vector_normalized_distance(&pzctx,&vec[ida],&vec[idb],1); + if(dfile#%lu: %lf\n",ida,idb,d); + } + printf("%d/%lu\r",++ctr,vec.size()*(vec.size()-1)/2); + fflush(stdout); + }; + tp.create_task(job_func,i,j); + } + tp.wait(); +} + +int main(int argc,char** argv) +{ + if(int pr=parse_arguments(argc,argv))return pr-1; + puts("building list of files to compare..."); + std::vector x; + for(auto&p:paths) + build_file_list(p,recursive,x); + printf("%lu files to compare.\n",x.size()); + puts("computing signature vectors..."); + puzzle_init_context(&pzctx); + std::vector cvecs; + cvecs.resize(x.size()); + compute_signature_vectors(x,cvecs); + for(auto &v:cvecs) + { + fprintf(stderr,"%lu:",v.sizeof_vec); + for(size_t i=0;i> r; + compare_signature_vectors(cvecs,r); + puts(""); + for(auto &t:r) + printf("%s<->%s: %lf\n",x[std::get<0>(t)].c_str(),x[std::get<1>(t)].c_str(),std::get<2>(t)); + printf("%lu similar images.",r.size()); + for(auto &v:cvecs)puzzle_free_cvec(&pzctx,&v); + 
cvecs.clear(); + puzzle_free_context(&pzctx); + return 0; +} diff --git a/deduper/libpuzzle/AUTHORS b/deduper/libpuzzle/AUTHORS new file mode 100644 index 0000000..bb6ecb3 --- /dev/null +++ b/deduper/libpuzzle/AUTHORS @@ -0,0 +1 @@ +Frank DENIS diff --git a/deduper/libpuzzle/COPYING b/deduper/libpuzzle/COPYING new file mode 100644 index 0000000..30877ad --- /dev/null +++ b/deduper/libpuzzle/COPYING @@ -0,0 +1,17 @@ +/* + * ISC License + * + * Copyright (c) 2007-2015 Frank DENIS + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ diff --git a/deduper/libpuzzle/ChangeLog b/deduper/libpuzzle/ChangeLog new file mode 100644 index 0000000..e69de29 diff --git a/deduper/libpuzzle/Makefile.am b/deduper/libpuzzle/Makefile.am new file mode 100644 index 0000000..fce7f7b --- /dev/null +++ b/deduper/libpuzzle/Makefile.am @@ -0,0 +1,11 @@ +AUTOMAKE_OPTIONS = gnu + +EXTRA_DIST = \ + autogen.sh \ + THANKS \ + README-PHP + +SUBDIRS = \ + src \ + man \ + php diff --git a/deduper/libpuzzle/NEWS b/deduper/libpuzzle/NEWS new file mode 100644 index 0000000..e69de29 diff --git a/deduper/libpuzzle/README b/deduper/libpuzzle/README new file mode 100644 index 0000000..502a1c0 --- /dev/null +++ b/deduper/libpuzzle/README @@ -0,0 +1,202 @@ + + .:. LIBPUZZLE .:. 
+ + http://libpuzzle.pureftpd.org + + + ------------------------ BLURB ------------------------ + + +The Puzzle library is designed to quickly find visually similar images (gif, +png, jpg), even if they have been resized, recompressed, recolored or slightly +modified. + +The library is free, lightweight yet very fast, configurable, easy to use and +it has been designed with security in mind. This is a C library, but it also +comes with a command-line tool and PHP bindings. + + + ------------------------ REFERENCE ------------------------ + + +The Puzzle library is an implementation of "An image signature for any kind of +image", by H. CHI WONG, Marschall BERN and David GOLDBERG. + + + ------------------------ COMPILATION ------------------------ + + +In order to load images, the library relies on the GD2 library. +You need to install gdlib2 and its development headers before compiling +libpuzzle. +The GD2 library is available as a pre-built package for most operating systems. +Debian and Ubuntu users should install the "libgd2-dev" or the "libgd2-xpm-dev" +package. +Gentoo users should install "media-libs/gd". +OpenBSD, NetBSD and DragonflyBSD users should install the "gd" package. +MacPorts users should install the "gd2" package. +X11 support is not required for the Puzzle library. + +Once GD2 has been installed, configure the Puzzle library as usual: + +./configure + +This is a standard autoconf script, if you're not familiar with it, please +have a look at the INSTALL file. + +Compile the beast: + +make + +Try the built-in tests: + +make check + +If everything looks fine, install the software: + +make install + +If anything goes wrong, please submit a bug report to: + libpuzzle [at] pureftpd [dot] org + + + ------------------------ USAGE ------------------------ + + +The API is documented in the libpuzzle(3) and puzzle_set(3) man pages. +You can also play with the puzzle-diff test application. +See puzzle-diff(8) for more info about the puzzle-diff application. 
+ +In order to be thread-safe, every exported function of the library requires a +PuzzleContext object. That object stores various run-time tunables. + +Out of a bitmap picture, the Puzzle library can fill a PuzzleCvec object: + + PuzzleContext context; + PuzzleCvec cvec; + + puzzle_init_context(&context); + puzzle_init_cvec(&context, &cvec); + puzzle_fill_cvec_from_file(&context, &cvec, "directory/filename.jpg"); + +The PuzzleCvec structure holds two fields: + signed char *vec: a pointer to the first element of the vector + size_t sizeof_vec: the number of elements + +The size depends on the "lambdas" value (see puzzle_set(3)). + +PuzzleCvec structures can be compared: + + d = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, 1); + +d is the normalized distance between both vectors. If d is below 0.6, pictures +are probably similar. + +If you need further help, feel free to subscribe to the mailing-list (see +below). + + + ------------------------ INDEXING ------------------------ + + +How to quickly find similar pictures, if there are millions of records? + +The original paper has a simple, yet efficient answer. + +Cut the vector in fixed-length words. For instance, let's consider the +following vector: + +[ a b c d e f g h i j k l m n o p q r s t u v w x y z ] + +With a word length (K) of 10, you can get the following words: + +[ a b c d e f g h i j ] found at position 0 +[ b c d e f g h i j k ] found at position 1 +[ c d e f g h i j k l ] found at position 2 +etc. until position N-1 + +Then, index your vector with a compound index of (word + position). + +Even with millions of images, K = 10 and N = 100 should be enough to have very +few entries sharing the same index. 
+ +Here's a very basic sample database schema: + ++-----------------------------+ +| signatures | ++-----------------------------+ +| sig_id | signature | pic_id | ++--------+-----------+--------+ + ++--------------------------+ +| words | ++--------------------------+ +| pos_and_word | fk_sig_id | ++--------------+-----------+ + +I'd recommend splitting at least the "words" table into multiple tables and/or +servers. + +By default (lambdas=9) signatures are 544 bytes long. In order to save storage +space, they can be compressed to one third of their original size through the +puzzle_compress_cvec() function. Before use, they must be uncompressed with +puzzle_uncompress_cvec(). + + + ------------------------ PUZZLE-DIFF ------------------------ + + +A command-line tool is also available for scripting or testing. + +It is installed as "puzzle-diff" and comes with a man page. + +Sample usage: + +- Output distance between two images: + +$ puzzle-diff pic-a-0.jpg pic-a-1.jpg +0.102286 + +- Compare two images, exit with 10 if they look the same, exit with 20 if +they don't (may be useful for scripts): + +$ puzzle-diff -e pic-a-0.jpg pic-a-1.jpg +$ echo $? +10 + +- Compute distance, without cropping and with computing the average intensity +of the whole blocks: + +$ puzzle-diff -p 1.0 -c pic-a-0.jpg pic-a-1.jpg +0.0523151 + + + ------------------------ COMPARING IMAGES WITH PHP ------------------------ + + +A PHP extension is bundled with the Libpuzzle package, and it provides PHP +bindings to most functions of the library. + +Documentation for the Libpuzzle PHP extension is available in the README-PHP +file. + + + ------------------------ APPS USING LIBPUZZLE ------------------------ + + +Here are third-party projects using libpuzzle: + +* ftwin - http://jok.is-a-geek.net/ftwin.php + ftwin is a tool useful to find duplicate files according to their content on +your file system. 
+ +* Python bindings for libpuzzle: PyPuzzle + https://github.com/ArchangelSDY/PyPuzzle + + + ------------------------ STATUS ------------------------ + + +This project is unfortunately not maintained any more. Pull requests are +always welcome, but I don't use this library any more and I don't have enough +spare time to actively work on it. diff --git a/deduper/libpuzzle/README-PHP b/deduper/libpuzzle/README-PHP new file mode 100644 index 0000000..6b14fb9 --- /dev/null +++ b/deduper/libpuzzle/README-PHP @@ -0,0 +1,76 @@ + + .:. LIBPUZZLE - PHP EXTENSION .:. + + http://libpuzzle.pureftpd.org + + + ------------------------ PHP EXTENSION ------------------------ + + +The Puzzle library can also be used through PHP, using a native extension. + +Prerequisites are the PHP headers, libtool, autoconf and automake. + +Here are the basic steps in order to install the extension: + +(on OpenBSD: export AUTOMAKE_VERSION=1.9 ; export AUTOCONF_VERSION=2.61) + +cd php/libpuzzle +phpize +./configure --with-libpuzzle +make clean +make +make install + +If libpuzzle is installed in a non-standard location, use: +./configure --with-libpuzzle=/base/directory/for/libpuzzle + +Then edit your php.ini file and add: + +extension=libpuzzle.so + + + ------------------------ USAGE ------------------------ + + +The PHP extension provides bindings for the following tuning functions: +- puzzle_set_max_width() +- puzzle_set_max_height() +- puzzle_set_lambdas() +- puzzle_set_noise_cutoff() +- puzzle_set_p_ratio() +- puzzle_set_contrast_barrier_for_cropping() +- puzzle_set_max_cropping_ratio() +- puzzle_set_autocrop() + +Have a look at the puzzle_set man page for more info about those. 
+ +Getting the signature of a picture is as simple as: + +$signature = puzzle_fill_cvec_from_file($filename); + +In order to compute the similarity between two pictures using their +signatures, use: + +$d = puzzle_vector_normalized_distance($signature1, $signature2); + +The result is between 0.0 and 1.0, with 0.6 being a good threshold to detect +visually similar pictures. + +The PUZZLE_CVEC_SIMILARITY_THRESHOLD, PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD, +PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD and PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD +constants can also be used to get common thresholds : + +if ($d < PUZZLE_CVEC_SIMILARITY_THRESHOLD) { + echo "Pictures look similar\n"; +} + +Before storing a signature into a database, you can compress it in order to +save some storage space: + +$compressed_signature = puzzle_compress_cvec($signature); + +Before use, those compressed signatures must be uncompressed with: + +$signature = puzzle_uncompress_cvec($compressed_signature); + diff --git a/deduper/libpuzzle/THANKS b/deduper/libpuzzle/THANKS new file mode 100644 index 0000000..86ef2e1 --- /dev/null +++ b/deduper/libpuzzle/THANKS @@ -0,0 +1,6 @@ +Xerox Research Center +H. CHI WONG +Marschall BERN +David GOLDBERG +Sameh CHAFIK +Gregory MAXWELL diff --git a/deduper/libpuzzle/autogen.sh b/deduper/libpuzzle/autogen.sh new file mode 100755 index 0000000..4717fc4 --- /dev/null +++ b/deduper/libpuzzle/autogen.sh @@ -0,0 +1,17 @@ +#! 
/bin/sh + +if [ -x "`which autoreconf 2>/dev/null`" ] ; then + exec autoreconf -ivf +fi + +if glibtoolize --version > /dev/null 2>&1; then + LIBTOOLIZE='glibtoolize' +else + LIBTOOLIZE='libtoolize' +fi + +$LIBTOOLIZE && \ +aclocal && \ +autoheader && \ +automake --add-missing --force-missing --include-deps && \ +autoconf diff --git a/deduper/libpuzzle/composer.json b/deduper/libpuzzle/composer.json new file mode 100644 index 0000000..4cd00e2 --- /dev/null +++ b/deduper/libpuzzle/composer.json @@ -0,0 +1,10 @@ +{ + "name": "jedisct1/libpuzzle", + "description": "A library to quickly find visually similar images.", + "version": "0.10.0", + "license": "MIT", + "type": "library", + "require": { + "php": "5.*" + } +} diff --git a/deduper/libpuzzle/configure.ac b/deduper/libpuzzle/configure.ac new file mode 100644 index 0000000..1abf0f6 --- /dev/null +++ b/deduper/libpuzzle/configure.ac @@ -0,0 +1,70 @@ +# -*- Autoconf -*- +# Process this file with autoconf to produce a configure script. + +AC_PREREQ(2.61) +AC_INIT(libpuzzle, 0.11, bugs@pureftpd.org) +AC_CONFIG_SRCDIR([src/puzzle.h]) +AC_CONFIG_HEADER([config.h]) +AM_INIT_AUTOMAKE([1.9 dist-bzip2]) +AM_MAINTAINER_MODE + +# Checks for programs. +AC_PROG_CXX +AC_PROG_CC +AC_PROG_CPP +AC_PROG_INSTALL +AC_PROG_LN_S +AC_PROG_MAKE_SET +AC_PATH_PROG(GDLIBCONFIG, [gdlib-config]) +CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE=1" +CPPFLAGS="$CPPFLAGS `$GDLIBCONFIG --cflags`" +LDFLAGS="$LDFLAGS `$GDLIBCONFIG --ldflags`" +LDADD="$LDADD `$GDLIBCONFIG --libs`" + +# Checks for libraries. + +AC_CHECK_LIB([gd], [gdImageCreateFromGd2],, + AC_ERROR([libgd2 development files not found])) + +# Checks for header files. +AC_HEADER_STDC +AM_PROG_LIBTOOL +AC_CHECK_HEADERS([limits.h memory.h stddef.h stdlib.h string.h unistd.h]) + +# Checks for typedefs, structures, and compiler characteristics. +AC_C_CONST +AC_TYPE_SIZE_T +AC_TYPE_SSIZE_T +AC_TYPE_OFF_T + +# Checks for library functions. 
+AC_FUNC_MALLOC +AC_FUNC_REALLOC +AC_FUNC_MEMCMP +AC_CHECK_FUNC([floor], ,[AC_CHECK_LIB([math], [floor])]) +AC_CHECK_FUNC([round], ,[AC_CHECK_LIB([math], [round])]) +AC_CHECK_FUNCS([strtoul]) + +AC_SUBST([MAINT]) + +AC_CONFIG_FILES([Makefile + man/Makefile + src/Makefile + src/pics/Makefile + php/Makefile + php/libpuzzle/Makefile + php/libpuzzle/include/Makefile + php/libpuzzle/modules/Makefile + php/libpuzzle/build/Makefile + php/libpuzzle/tests/Makefile + php/libpuzzle/tests/pics/Makefile + php/examples/Makefile + php/examples/similar/Makefile + ]) +AC_OUTPUT + +AC_MSG_NOTICE([+-------------------------------------------------------+]) +AC_MSG_NOTICE([| You can subscribe to the Libpuzzle users mailing-list |]) +AC_MSG_NOTICE([| to ask for help and to stay informed of new releases. |]) +AC_MSG_NOTICE([| Go to http://libpuzzle.pureftpd.org/ml/ now! |]) +AC_MSG_NOTICE([+-------------------------------------------------------+]) diff --git a/deduper/libpuzzle/man/Makefile.am b/deduper/libpuzzle/man/Makefile.am new file mode 100644 index 0000000..a3a78a5 --- /dev/null +++ b/deduper/libpuzzle/man/Makefile.am @@ -0,0 +1,7 @@ +man_MANS = \ + libpuzzle.3 \ + puzzle_set.3 \ + puzzle-diff.8 + +EXTRA_DIST = \ + $(man_MANS) diff --git a/deduper/libpuzzle/man/libpuzzle.3 b/deduper/libpuzzle/man/libpuzzle.3 new file mode 100644 index 0000000..98cfcbb --- /dev/null +++ b/deduper/libpuzzle/man/libpuzzle.3 @@ -0,0 +1,296 @@ +.\" +.\" Copyright (c) 2007-2014 Frank DENIS +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd $Mdocdate: March 31 2011 $ +.Dt LIBPUZZLE 3 +.Sh NAME +.Nm puzzle_init_cvec , +.Nm puzzle_init_dvec , +.Nm puzzle_fill_dvec_from_file , +.Nm puzzle_fill_cvec_from_file , +.Nm puzzle_fill_dvec_from_mem , +.Nm puzzle_fill_cvec_from_mem , +.Nm puzzle_fill_cvec_from_dvec , +.Nm puzzle_free_cvec , +.Nm puzzle_free_dvec , +.Nm puzzle_init_compressed_cvec , +.Nm puzzle_free_compressed_cvec , +.Nm puzzle_compress_cvec , +.Nm puzzle_uncompress_cvec , +.Nm puzzle_vector_normalized_distance +.Nd compute comparable signatures of bitmap images. +.Sh SYNOPSIS +.Fd #include +.Ft void +.Fn puzzle_init_context "PuzzleContext *context" +.Ft void +.Fn puzzle_free_context "PuzzleContext *context" +.Ft void +.Fn puzzle_init_cvec "PuzzleContext *context" "PuzzleCvec *cvec" +.Ft void +.Fn puzzle_init_dvec "PuzzleContext *context" "PuzzleDvec *dvec" +.Ft int +.Fn puzzle_fill_dvec_from_file "PuzzleContext *context" "PuzzleDvec * dvec" "const char *file" +.Ft int +.Fn puzzle_fill_cvec_from_file "PuzzleContext *context" "PuzzleCvec * cvec" "const char *file" +.Ft int +.Fn puzzle_fill_dvec_from_mem "PuzzleContext *context" "PuzzleDvec * dvec" "const void *mem" "size_t size" +.Ft int +.Fn puzzle_fill_cvec_from_mem "PuzzleContext *context" "PuzzleCvec * cvec" "const void *mem" "size_t size" +.Ft int +.Fn puzzle_fill_cvec_from_dvec "PuzzleContext *context" "PuzzleCvec * cvec" "const PuzzleDvec *dvec" +.Ft void +.Fn puzzle_free_cvec "PuzzleContext *context" "PuzzleCvec *cvec" +.Ft void +.Fn puzzle_free_dvec "PuzzleContext *context" "PuzzleDvec *dvec" +.Ft void +.Fn puzzle_init_compressed_cvec "PuzzleContext *context" "PuzzleCompressedCvec * 
compressed_cvec" +.Ft void +.Fn puzzle_free_compressed_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" +.Ft int +.Fn puzzle_compress_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" "const PuzzleCvec * cvec" +.Ft int +.Fn puzzle_uncompress_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" "PuzzleCvec * const cvec" +.Ft double +.Fn puzzle_vector_normalized_distance "PuzzleContext *context" "const PuzzleCvec * cvec1" "const PuzzleCvec * cvec2" "int fix_for_texts" +.Sh DESCRIPTION +The Puzzle library computes a signature out of a bitmap picture. +Signatures are comparable and similar pictures have similar signatures. +.Pp +After a picture has been loaded and uncompressed, featureless parts of +the image are skipped (autocrop), unless that step has been explicitly +disabled, see +.Xr puzzle_set 3 +.Sh LIBPUZZLE CONTEXT +Every public function requires a +.Va PuzzleContext +object, which stores all required tunables. +.Pp +Any application using libpuzzle should initialize a +.Va PuzzleContext +object with +.Fn puzzle_init_context +and free it after use with +.Fn puzzle_free_context +.Bd \-literal \-offset indent +PuzzleContext context; + +puzzle_init_context(&context); + ... +puzzle_free_context(&context); +.Ed +.Sh DVEC AND CVEC VECTORS +The next step is to divide the cropped image into a grid and to compute +the average intensity of soft\(hyedged pixels in every block. The result is a +.Va PuzzleDvec +object. +.Pp +.Va PuzzleDvec +objects should be initialized before use, with +.Fn puzzle_init_dvec +and freed after use with +.Fn puzzle_free_dvec +.Pp +The +.Va PuzzleDvec +structure has two important fields: +.Va vec +is the pointer to the first element of the array containing the average +intensities, and +.Va sizeof_compressed_vec +is the number of elements. +.Pp +.Va PuzzleDvec +objects are not comparable, so what you usually want is to transform these +objects into +.Va PuzzleCvec +objects. 
+.Pp +A +.Va PuzzleCvec +object is a vector with relationships between adjacent blocks from a +.Va PuzzleDvec +object. +.Pp +The +.Fn puzzle_fill_cvec_from_dvec +fills a +.Va PuzzleCvec +object from a +.Va PuzzleDvec +object. +.Pp +But just like the other structure, +.Va PuzzleCvec +objects must be initialized and freed with +.Fn puzzle_init_cvec +and +.Fn puzzle_free_cvec +.Pp +.Va PuzzleCvec +objects have a vector whose first element is in the +.Va vec +field, and the number of elements is in the +.Va sizeof_vec +field +.Sh LOADING PICTURES +.Va PuzzleDvec +and +.Va PuzzleCvec +objects can be computed from a bitmap picture file, with +.Fn puzzle_fill_dvec_from_file +and +.Fn puzzle_fill_cvec_from_file +.Pp +.Em GIF +, +.Em PNG +and +.Em JPEG +file formats are currently supported and automatically recognized. +.Pp +Here's a simple example that creates a +.Va PuzzleCvec +object out of a file. +.Bd \-literal \-offset indent +PuzzleContext context; +PuzzleCvec cvec; + +puzzle_init_context(&context); +puzzle_init_cvec(&context, &cvec); +puzzle_fill_cvec_from_file(&context, &cvec, "test\-picture.jpg"); + ... +puzzle_free_cvec(&context, &cvec); +puzzle_free_context(&context); +.Ed +.Sh COMPARING VECTORS +In order to check whether two pictures are similar, you need to compare their +.Va PuzzleCvec +signatures, using +.Fn puzzle_vector_normalized_distance +.Pp +That function returns a distance, between 0.0 and 1.0. The smaller the distance, the more similar the pictures. +.Pp +Tests on common pictures show that a normalized distance of 0.6 (also defined as +.Va PUZZLE_CVEC_SIMILARITY_THRESHOLD +) means that both pictures are visually similar. +.Pp +If that threshold is not right for your set of pictures, you can experiment +with +.Va PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD +, +.Va PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD +and +.Va PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD +or with your own value. 
+.Pp +If the +.Fa fix_for_texts +of +.Fn puzzle_vector_normalized_distance +is +.Em 1 +, a fix is applied to the computation in order to deal with bitmap pictures +that contain text. That fix is recommended, as it allows using the same +threshold for that kind of picture as for generic pictures. +.Pp +If +.Fa fix_for_texts +is +.Em 0 +, that special way of computing the normalized distance is disabled. +.Bd \-literal \-offset indent +PuzzleContext context; +PuzzleCvec cvec1, cvec2; +double d; + +puzzle_init_context(&context); +puzzle_init_cvec(&context, &cvec1); +puzzle_init_cvec(&context, &cvec2); +puzzle_fill_cvec_from_file(&context, &cvec1, "test\-picture\-1.jpg"); +puzzle_fill_cvec_from_file(&context, &cvec2, "test\-picture\-2.jpg"); +d = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, 1); +if (d < PUZZLE_CVEC_SIMILARITY_THRESHOLD) { + puts("Pictures are similar"); +} +puzzle_free_cvec(&context, &cvec2); +puzzle_free_cvec(&context, &cvec1); +puzzle_free_context(&context); +.Ed +.Sh CVEC COMPRESSION +In order to reduce storage needs, +.Va PuzzleCvec +objects can be compressed to 1/3 of their original size. +.Pp +.Va PuzzleCompressedCvec +structures hold the compressed data. Before and after use, these structures +have to be passed to +.Fn puzzle_init_compressed_cvec +and +.Fn puzzle_free_compressed_cvec +.Pp +.Fn puzzle_compress_cvec +compresses a +.Va PuzzleCvec +object into a +.Va PuzzleCompressedCvec +object. +.Pp +And +.Fn puzzle_uncompress_cvec +uncompresses a +.Va PuzzleCompressedCvec +object into a +.Va PuzzleCvec +object. +.Bd \-literal \-offset indent +PuzzleContext context; +PuzzleCvec cvec; +PuzzleCompressedCvec c_cvec; + ... +puzzle_init_compressed_cvec(&context, &c_cvec); +puzzle_compress_cvec(&context, &c_cvec, &cvec); + ... 
+puzzle_free_compressed_cvec(&context, &c_cvec); +.Ed +The +.Va PuzzleCompressedCvec +structure has two important fields: +.Va vec +that is a pointer to the first element of the compressed data, and +.Va sizeof_compressed_vec +that contains the number of elements. +.Sh RETURN VALUE +Functions return +.Em 0 +on success, and +.Em \-1 +if something went wrong. +.Sh AUTHORS +.Nf +Frank DENIS +libpuzzle at pureftpd dot org +.Fi +.Sh ACKNOWLEDGMENTS +.Nf +Xerox Research Center +H. CHI WONG +Marschall BERN +David GOLDBERG +Sameh SCHAFIK +.Fi +.Sh SEE ALSO +.Xr puzzle_set 3 +.Xr puzzle\-diff 8 diff --git a/deduper/libpuzzle/man/puzzle-diff.8 b/deduper/libpuzzle/man/puzzle-diff.8 new file mode 100644 index 0000000..5744b5a --- /dev/null +++ b/deduper/libpuzzle/man/puzzle-diff.8 @@ -0,0 +1,58 @@ +.\" +.\" Copyright (c) 2007-2014 Frank DENIS +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd $Mdocdate: September 23 2007 $ +.Dt PUZZLE-DIFF 1 +.Os +.Sh NAME +.Nm puzzle\-diff +.Nd compare pictures with libpuzzle +.Sh SYNOPSIS +.Nm puzzle\-diff +[\-b ] +[\-e] [\-E ] [\-h] [\-H ] [\-l ] +[\-n ] [\-p

] [\-t] [\-W ] + + +.Sh DESCRIPTION +puzzle\-diff compares two pictures and outputs the normalized distance. +.Pp +Try +.Em puzzle\-diff \-h +for more info. +.Sh EXAMPLES +Output distance between two images: +.Bd -literal -offset indent +$ puzzle\-diff pic\-a\-0.jpg pics\-a\-1.jpg +0.102286 +.Ed +.Pp +Compare two images, exit with 10 if they look the same, exit with 20 if +they don't (may be useful for scripts): +.Bd -literal -offset indent +$ puzzle\-diff \-e pic\-a\-0.jpg pics\-a\-1.jpg +$ echo $? +10 +.Ed +.Pp +Compute distance, without cropping and with computing the average intensity +of the whole blocks: +.Bd -literal -offset indent +$ puzzle\-diff \-p 1.0 \-c pic\-a\-0.jpg pic\-a\-1.jpg +0.0523151 +.Ed +.Sh SEE ALSO +.Xr libpuzzle 3 +.Xr puzzle_set 3 diff --git a/deduper/libpuzzle/man/puzzle_set.3 b/deduper/libpuzzle/man/puzzle_set.3 new file mode 100644 index 0000000..a8d017b --- /dev/null +++ b/deduper/libpuzzle/man/puzzle_set.3 @@ -0,0 +1,129 @@ +.\" +.\" Copyright (c) 2007-2014 Frank DENIS +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+.\" +.Dd $Mdocdate: September 24 2007 $ +.Dt PUZZLE_SET 3 +.Sh NAME +.Nm puzzle_set_max_width , +.Nm puzzle_set_max_height , +.Nm puzzle_set_lambdas , +.Nm puzzle_set_p_ratio , +.Nm puzzle_set_noise_cutoff , +.Nm puzzle_set_contrast_barrier_for_cropping , +.Nm puzzle_set_max_cropping_ratio , +.Nm puzzle_set_autocrop +.Nd set tunables for libpuzzle functions. +.Sh SYNOPSIS +.Fd #include +.Ft int +.Fn puzzle_set_max_width "PuzzleContext *context" "unsigned int width" +.Ft int +.Fn puzzle_set_max_height "PuzzleContext *context" "unsigned int height" +.Ft int +.Fn puzzle_set_lambdas "PuzzleContext *context" "unsigned int lambdas" +.Ft int +.Fn puzzle_set_p_ratio "PuzzleContext *context" "double p_ratio" +.Ft int +.Fn puzzle_set_noise_cutoff "PuzzleContext *context" "double noise_cutoff" +.Ft int +.Fn puzzle_set_contrast_barrier_for_cropping "PuzzleContext *context" "double barrier" +.Ft int +.Fn puzzle_set_max_cropping_ratio "PuzzleContext *context" "double ratio" +.Ft int +.Fn puzzle_set_autocrop "PuzzleContext *context" "int enable" +.Sh DESCRIPTION +While default values have been chosen to be ok for most people, the +.Fn puzzle_set_* +functions are knobs to fit the algorithm to your set of data and to your +applications. +.Sh LAMBDAS +By default, pictures are divided in 9 x 9 blocks. +.Pp +.Em 9 +is the +.Em lambdas +value, and it can be changed with +.Fn puzzle_set_lambdas +.Pp +For large databases, for complex images, for images with a lot of text or +for sets of near\(hysimilar images, it might be better to raise that value to +.Em 11 +or even +.Em 13 +.Pp +However, raising that value obviously means that vectors will require more +storage space. +.Pp +The +.Em lambdas +value should remain the same in order to get comparable vectors. So if you +pick +.Em 11 +(for instance), you should always use that value for all pictures you will +compute a digest for. +.Fn puzzle_set_p_ratio +.Pp +The average intensity of each block is based upon a small centered zone. 
+.Pp +The "p ratio" determines the size of that zone. The default is 2.0, and that +ratio mimics the behavior that is described in the reference algorithm. +.Pp +For very specific cases (complex images) or if you get too many false +positives, as an alternative to increasing lambdas, you can try to lower that +value, for instance to 1.5. +.Pp +The lowest acceptable value is 1.0. +.Sh MAXIMUM SIZES +In order to avoid CPU starvation, pictures won't be processed if their width +or height is larger than 3000 pixels. +.Pp +These limits are rather large, but if you ever need to change them, the +.Fn puzzle_set_max_width +and +.Fn puzzle_set_max_height +are available. +.Sh NOISE CUTOFF +The noise cutoff defaults to 2. If you raise that value, more zones with +little difference of intensity will be considered as similar. +.Pp +Unless you have very specialized sets of pictures, you probably don't want +to change this. +.Sh AUTOCROP +By default, featureless borders of the original image are ignored. The size +of each border depends on the sum of absolute values of differences between +adjacent pixels, relative to the total sum. +.Pp +That feature can be disabled with +.Fn puzzle_set_autocrop "0" +Any other value will enable it. +.Pp +.Fn puzzle_set_contrast_barrier_for_cropping +changes the tolerance. The default value is 5. Less shaves less, more shaves +more. +.Pp +.Fn puzzle_set_max_cropping_ratio +This is a safe\(hyguard against unwanted excessive auto\(hycropping. +.Pp +The default (0.25) means that no more than 25% of the total width (or +height) will ever be shaved. +.Sh RETURN VALUE +Functions return +.Em 0 +on success, and +.Em \-1 +if something went wrong. 
+.Sh SEE ALSO +.Xr libpuzzle 3 +.Xr puzzle\-diff 8 diff --git a/deduper/libpuzzle/php/Makefile.am b/deduper/libpuzzle/php/Makefile.am new file mode 100644 index 0000000..dc0165f --- /dev/null +++ b/deduper/libpuzzle/php/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = \ + libpuzzle \ + examples diff --git a/deduper/libpuzzle/php/examples/Makefile.am b/deduper/libpuzzle/php/examples/Makefile.am new file mode 100644 index 0000000..82c81ba --- /dev/null +++ b/deduper/libpuzzle/php/examples/Makefile.am @@ -0,0 +1,2 @@ +SUBDIRS = \ + similar diff --git a/deduper/libpuzzle/php/examples/similar/Makefile.am b/deduper/libpuzzle/php/examples/similar/Makefile.am new file mode 100644 index 0000000..126f6df --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/Makefile.am @@ -0,0 +1,6 @@ +EXTRA_DIST = \ + schema.sqlite3.sql \ + schema.pgsql.sql \ + similar.php \ + similar.inc.php \ + config.inc.php diff --git a/deduper/libpuzzle/php/examples/similar/config.inc.php b/deduper/libpuzzle/php/examples/similar/config.inc.php new file mode 100644 index 0000000..d4e3b41 --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/config.inc.php @@ -0,0 +1,9 @@ + diff --git a/deduper/libpuzzle/php/examples/similar/schema.pgsql.sql b/deduper/libpuzzle/php/examples/similar/schema.pgsql.sql new file mode 100644 index 0000000..7dc6bc1 --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/schema.pgsql.sql @@ -0,0 +1,230 @@ +-- +-- PostgreSQL database dump +-- + +SET client_encoding = 'UTF8'; +SET standard_conforming_strings = off; +SET check_function_bodies = false; +SET client_min_messages = warning; +SET escape_string_warning = off; + +SET SESSION AUTHORIZATION 'similar'; + +-- +-- Name: SCHEMA public; Type: COMMENT; Schema: -; Owner: similar +-- + +COMMENT ON SCHEMA public IS 'Standard public schema'; + + +SET search_path = public, pg_catalog; + +SET default_tablespace = ''; + +SET default_with_oids = false; + +-- +-- Name: pictures; Type: TABLE; Schema: public; Owner: similar; Tablespace: 
+-- + +CREATE TABLE pictures ( + id integer NOT NULL, + digest character(32) NOT NULL, + CONSTRAINT ck_digest CHECK ((char_length(digest) = 32)) +); + + +-- +-- Name: pictures_id_seq; Type: SEQUENCE; Schema: public; Owner: similar +-- + +CREATE SEQUENCE pictures_id_seq + START WITH 1 + INCREMENT BY 1 + NO MAXVALUE + NO MINVALUE + CACHE 1; + + +-- +-- Name: pictures_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: similar +-- + +ALTER SEQUENCE pictures_id_seq OWNED BY pictures.id; + + +-- +-- Name: sentpictures; Type: TABLE; Schema: public; Owner: similar; Tablespace: +-- + +CREATE TABLE sentpictures ( + id integer NOT NULL, + url character varying(255) NOT NULL, + sender character varying(100) NOT NULL, + picture_id integer NOT NULL, + CONSTRAINT ck_url CHECK (((url)::text <> ''::text)) +); + + +-- +-- Name: sentpictures_id_seq; Type: SEQUENCE; Schema: public; Owner: similar +-- + +CREATE SEQUENCE sentpictures_id_seq + START WITH 1 + INCREMENT BY 1 + NO MAXVALUE + NO MINVALUE + CACHE 1; + + +-- +-- Name: sentpictures_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: similar +-- + +ALTER SEQUENCE sentpictures_id_seq OWNED BY sentpictures.id; + + +-- +-- Name: signatures; Type: TABLE; Schema: public; Owner: similar; Tablespace: +-- + +CREATE TABLE signatures ( + id integer NOT NULL, + compressed_signature bytea NOT NULL, + picture_id integer NOT NULL, + CONSTRAINT ck_signature CHECK ((octet_length(compressed_signature) >= 182)) +); + + +-- +-- Name: signatures_id_seq; Type: SEQUENCE; Schema: public; Owner: similar +-- + +CREATE SEQUENCE signatures_id_seq + START WITH 1 + INCREMENT BY 1 + NO MAXVALUE + NO MINVALUE + CACHE 1; + + +-- +-- Name: signatures_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: similar +-- + +ALTER SEQUENCE signatures_id_seq OWNED BY signatures.id; + + +-- +-- Name: words; Type: TABLE; Schema: public; Owner: similar; Tablespace: +-- + +CREATE TABLE words ( + pos_and_word bytea NOT NULL, + signature_id integer NOT NULL, + 
CONSTRAINT ck_pos_and_word CHECK ((octet_length(pos_and_word) >= 2)) +); + + +-- +-- Name: id; Type: DEFAULT; Schema: public; Owner: similar +-- + +ALTER TABLE pictures ALTER COLUMN id SET DEFAULT nextval('pictures_id_seq'::regclass); + + +-- +-- Name: id; Type: DEFAULT; Schema: public; Owner: similar +-- + +ALTER TABLE sentpictures ALTER COLUMN id SET DEFAULT nextval('sentpictures_id_seq'::regclass); + + +-- +-- Name: id; Type: DEFAULT; Schema: public; Owner: similar +-- + +ALTER TABLE signatures ALTER COLUMN id SET DEFAULT nextval('signatures_id_seq'::regclass); + + +-- +-- Name: pictures_pkey; Type: CONSTRAINT; Schema: public; Owner: similar; Tablespace: +-- + +ALTER TABLE ONLY pictures + ADD CONSTRAINT pictures_pkey PRIMARY KEY (id); + + +-- +-- Name: sentpictures_pkey; Type: CONSTRAINT; Schema: public; Owner: similar; Tablespace: +-- + +ALTER TABLE ONLY sentpictures + ADD CONSTRAINT sentpictures_pkey PRIMARY KEY (id); + + +-- +-- Name: signatures_pkey; Type: CONSTRAINT; Schema: public; Owner: similar; Tablespace: +-- + +ALTER TABLE ONLY signatures + ADD CONSTRAINT signatures_pkey PRIMARY KEY (id); + + +-- +-- Name: idx_digest; Type: INDEX; Schema: public; Owner: similar; Tablespace: +-- + +CREATE UNIQUE INDEX idx_digest ON pictures USING btree (digest); + + +-- +-- Name: idx_picture_id; Type: INDEX; Schema: public; Owner: similar; Tablespace: +-- + +CREATE INDEX idx_picture_id ON sentpictures USING btree (picture_id); + + +-- +-- Name: idx_pos_and_word; Type: INDEX; Schema: public; Owner: similar; Tablespace: +-- + +CREATE INDEX idx_pos_and_word ON words USING btree (pos_and_word); + + +-- +-- Name: idx_url; Type: INDEX; Schema: public; Owner: similar; Tablespace: +-- + +CREATE UNIQUE INDEX idx_url ON sentpictures USING btree (url); + + +-- +-- Name: sentpictures_picture_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: similar +-- + +ALTER TABLE ONLY sentpictures + ADD CONSTRAINT sentpictures_picture_id_fkey FOREIGN KEY (picture_id) REFERENCES pictures(id) 
ON UPDATE CASCADE ON DELETE CASCADE; + + +-- +-- Name: signatures_picture_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: similar +-- + +ALTER TABLE ONLY signatures + ADD CONSTRAINT signatures_picture_id_fkey FOREIGN KEY (picture_id) REFERENCES pictures(id) ON UPDATE CASCADE ON DELETE CASCADE; + + +-- +-- Name: words_signature_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: similar +-- + +ALTER TABLE ONLY words + ADD CONSTRAINT words_signature_id_fkey FOREIGN KEY (signature_id) REFERENCES signatures(id) ON UPDATE CASCADE ON DELETE CASCADE; + + +-- +-- PostgreSQL database dump complete +-- + diff --git a/deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql b/deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql new file mode 100644 index 0000000..dc5a6c3 --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql @@ -0,0 +1,23 @@ +CREATE TABLE pictures ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + digest CHAR(32) NOT NULL +); +CREATE TABLE sentpictures ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url VARCHAR(255) NOT NULL, + sender VARCHAR(100) NOT NULL, + picture_id INTEGER NOT NULL +); +CREATE TABLE signatures ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + compressed_signature CHAR(182) NOT NULL, + picture_id INTEGER NOT NULL +); +CREATE TABLE words ( + pos_and_word CHAR(5) NOT NULL, + signature_id INTEGER NOT NULL +); +CREATE UNIQUE INDEX idx_digest ON pictures(digest); +CREATE INDEX idx_picture_id ON sentpictures (picture_id); +CREATE INDEX idx_pos_and_word ON words(pos_and_word); +CREATE UNIQUE INDEX idx_url ON sentpictures (url); diff --git a/deduper/libpuzzle/php/examples/similar/similar.inc.php b/deduper/libpuzzle/php/examples/similar/similar.inc.php new file mode 100644 index 0000000..cfc806e --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/similar.inc.php @@ -0,0 +1,120 @@ +beginTransaction(); + try { + $st = $dbh->prepare + ('DELETE FROM sentpictures WHERE url = :url'); + $st->execute(array(':url' => $url)); + $st 
= $dbh->prepare + ('SELECT id FROM pictures WHERE digest = :digest'); + $st->execute(array(':digest' => $md5)); + $picture_id = $st->fetchColumn(); + $st->closeCursor(); + $duplicate = TRUE; + if ($picture_id === FALSE) { + $duplicate = FALSE; + $st = $dbh->prepare + ('INSERT INTO pictures (digest) VALUES (:digest)'); + $st->execute(array(':digest' => $md5)); + $picture_id = $dbh->lastInsertId('id'); + } + $st = $dbh->prepare + ('INSERT INTO sentpictures (url, sender, picture_id) ' . + 'VALUES (:url, :sender, :picture_id)'); + $st->execute(array(':url' => $url, ':sender' => $client_info, + ':picture_id' => $picture_id)); + if ($duplicate === TRUE) { + $dbh->commit(); + return TRUE; + } + $st = $dbh->prepare + ('INSERT INTO signatures (compressed_signature, picture_id) ' . + 'VALUES(:compressed_signature, :picture_id)'); + $st->execute(array(':compressed_signature' => $compressed_cvec, + ':picture_id' => $picture_id)); + $signature_id = $dbh->lastInsertId('id'); + $st = $dbh->prepare + ('INSERT INTO words (pos_and_word, signature_id) ' . + 'VALUES (:pos_and_word, :signature_id)'); + foreach ($words as $u => $word) { + $st->execute(array('pos_and_word' + => chr($u) . puzzle_compress_cvec($word), + 'signature_id' => $signature_id)); + } + $dbh->commit(); + } catch (Exception $e) { + var_dump($e); + $dbh->rollback(); + } + return TRUE; +} + +function find_similar_pictures($md5, $cvec, + $threshold = PUZZLE_CVEC_SIMILARITY_THRESHOLD) { + $compressed_cvec = puzzle_compress_cvec($cvec); + $words = split_into_words($cvec); + $dbh = new PDO(DB_DSN); + $dbh->beginTransaction(); + $sql = 'SELECT DISTINCT(signature_id) AS signature_id FROM words ' . + 'WHERE pos_and_word IN ('; + $coma = FALSE; + foreach ($words as $u => $word) { + if ($coma === TRUE) { + $sql .= ','; + } + $sql .= $dbh->quote(chr($u) . 
puzzle_compress_cvec($word)); + $coma = TRUE; + } + $sql .= ')'; + $res_words = $dbh->query($sql); + $scores = array(); + $st = $dbh->prepare('SELECT compressed_signature, picture_id ' . + 'FROM signatures WHERE id = :id'); + while (($signature_id = $res_words->fetchColumn()) !== FALSE) { + $st->execute(array(':id' => $signature_id)); + $row = $st->fetch(); + $found_compressed_signature = $row['compressed_signature']; + $picture_id = $row['picture_id']; + $found_cvec = puzzle_uncompress_cvec($found_compressed_signature); + $distance = puzzle_vector_normalized_distance($cvec, $found_cvec); + if ($distance < $threshold && $distance > 0.0) { + $scores[$picture_id] = $distance; + } + } + $sql = 'SELECT url FROM sentpictures WHERE picture_id IN ('; + $coma = FALSE; + foreach ($scores as $picture_id => $score) { + if ($coma === TRUE) { + $sql .= ','; + } + $sql .= $dbh->quote($picture_id); + $coma = TRUE; + } + $sql .= ')'; + $urls = array(); + if (!empty($scores)) { + $res_urls = $dbh->query($sql); + while (($url = $res_urls->fetchColumn()) !== FALSE) { + array_push($urls, $url); + } + } + $dbh->commit(); + + return $urls; +} + +?> diff --git a/deduper/libpuzzle/php/examples/similar/similar.php b/deduper/libpuzzle/php/examples/similar/similar.php new file mode 100644 index 0000000..4b3ad40 --- /dev/null +++ b/deduper/libpuzzle/php/examples/similar/similar.php @@ -0,0 +1,158 @@ + + + + +

Similar images finder using libpuzzle

+' . "\n"; + echo 'Enter an image URL (http only):' . "\n"; + echo '' . "\n"; + echo ''; + echo '' . "\n"; +} + +function display_error($err) { + echo '
' . htmlspecialchars($err) . '
' . "\n"; +} + +function display_loading() { + echo '
Loading...
' . "\n"; + @ob_flush(); flush(); +} + +function display_loaded() { + echo '
Loaded.
' . "\n"; + @ob_flush(); flush(); +} + +function display_signature_ok() { + echo '
Signature computed.
' . "\n"; + @ob_flush(); flush(); +} + +function remove_tmpfile($file) { + @unlink($file); +} + +function get_client_info() { + return @$_SERVER['REMOTE_ADDR'] . '/' . time(); +} + +function display_similar_pictures($urls) { + echo '
' . "\n"; + foreach ($urls as $url) { + echo ''; + echo ' '; + echo '' . "\n"; + + } + echo '
' . "\n"; +} + +function record_url($url, &$md5, &$cvec) { + if (function_exists('sys_get_temp_dir')) { + $tmpdir = sys_get_temp_dir(); + } else { + $tmpdir = '/tmp'; + } + $dfn = tempnam($tmpdir, 'similar-' . md5(uniqid(mt_rand(), TRUE))); + register_shutdown_function('remove_tmpfile', $dfn); + if (($dfp = fopen($dfn, 'w')) == FALSE) { + display_form(); + display_error('Unable to create the temporary file'); + return FALSE; + } + if (($fp = fopen($url, 'r')) == FALSE) { + display_form(); + display_error('Unable to open: [' . $url . ']'); + return FALSE; + } + $f = fread($fp, 4096); + $written = strlen($f); + if (empty($f)) { + display_form(); + display_error('Unable to load: [' . $url . ']'); + return FALSE; + } + fwrite($dfp, $f); + $infos = @getimagesize($dfn); + if (empty($infos) || + ($infos[2] !== IMAGETYPE_GIF && $infos[2] !== IMAGETYPE_JPEG && + $infos[2] !== IMAGETYPE_PNG) || + $infos[0] < 50 || $infos[1] < 50) { + fclose($dfp); + display_form(); + display_error('Unsupported image format'); + return FALSE; + } + fseek($dfp, strlen($f)); + while (!feof($fp)) { + $max = MAX_IMAGE_SIZE - $written; + if ($max > 65536) { + $max = 65536; + } + $t = fread($fp, $max); + fwrite($dfp, $t); + $written += strlen($t); + if ($written > MAX_IMAGE_SIZE) { + fclose($dfp); + display_form(); + display_error('File too large'); + return FALSE; + } + } + unset($t); + fclose($dfp); + display_loaded(); + $md5 = @md5_file($dfn); + if (empty($md5)) { + display_form(); + display_error('Unable to get the MD5 of the file'); + return FALSE; + } + $cvec = puzzle_fill_cvec_from_file($dfn); + if (empty($cvec)) { + display_form(); + display_error('Unable to compute image signature'); + return FALSE; + } + display_signature_ok(); + save_signature($url, get_client_info(), $md5, $cvec); + + return TRUE; +} + +$url = trim(@$_POST['url']); +if (empty($url)) { + display_form(); + exit(0); +} +if (strlen($url) > MAX_URL_SIZE || + preg_match('£^http://([a-z0-9-]+[.])+[a-z]{2,}/.£i', $url) <= 0) { 
+ display_form(); + display_error('Invalid URL, must be http://...'); + exit(1); +} +display_loading(); +$md5 = FALSE; +$cvec = FALSE; +if (record_url($url, $md5, $cvec) !== TRUE) { + exit(1); +} +$urls = find_similar_pictures($md5, $cvec); +unset($cvec); +display_form(); +display_similar_pictures($urls); + +?> + + diff --git a/deduper/libpuzzle/php/libpuzzle/CREDITS b/deduper/libpuzzle/php/libpuzzle/CREDITS new file mode 100644 index 0000000..bb6ecb3 --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/CREDITS @@ -0,0 +1 @@ +Frank DENIS diff --git a/deduper/libpuzzle/php/libpuzzle/EXPERIMENTAL b/deduper/libpuzzle/php/libpuzzle/EXPERIMENTAL new file mode 100644 index 0000000..e69de29 diff --git a/deduper/libpuzzle/php/libpuzzle/LICENSE b/deduper/libpuzzle/php/libpuzzle/LICENSE new file mode 100644 index 0000000..1ce2d05 --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/LICENSE @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2007-2015 Frank DENIS + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ diff --git a/deduper/libpuzzle/php/libpuzzle/Makefile.am b/deduper/libpuzzle/php/libpuzzle/Makefile.am new file mode 100644 index 0000000..f582035 --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/Makefile.am @@ -0,0 +1,15 @@ +EXTRA_DIST = \ + CREDITS \ + EXPERIMENTAL \ + LICENSE \ + README \ + config.m4 \ + libpuzzle.c \ + libpuzzle.php \ + php_libpuzzle.h + +SUBDIRS = \ + build \ + include \ + modules \ + tests diff --git a/deduper/libpuzzle/php/libpuzzle/README b/deduper/libpuzzle/php/libpuzzle/README new file mode 100644 index 0000000..7bb674f --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/README @@ -0,0 +1,4 @@ +This is a PHP extension for libpuzzle. + +Have a look at the README-PHP file on top of the libpuzzle distribution for +more info about that extension. diff --git a/deduper/libpuzzle/php/libpuzzle/build/Makefile.am b/deduper/libpuzzle/php/libpuzzle/build/Makefile.am new file mode 100644 index 0000000..e69de29 diff --git a/deduper/libpuzzle/php/libpuzzle/config.m4 b/deduper/libpuzzle/php/libpuzzle/config.m4 new file mode 100644 index 0000000..84f954a --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/config.m4 @@ -0,0 +1,49 @@ +dnl config.m4 for extension libpuzzle + +dnl If your extension references something external, use with: + +PHP_ARG_WITH(libpuzzle, for libpuzzle support, + [ --with-libpuzzle Include libpuzzle support]) + +if test "$PHP_LIBPUZZLE" != "no"; then + for i in $PHP_LIBPUZZLE /usr/local /usr; do + if test -x "$i/bin/gdlib-config"; then + GDLIB_CONFIG=$i/bin/gdlib-config + break + fi + done + GDLIB_LIBS=$($GDLIB_CONFIG --ldflags --libs) + GDLIB_INCS=$($GDLIB_CONFIG --cflags) + + PHP_EVAL_LIBLINE($GDLIB_LIBS, LIBPUZZLE_SHARED_LIBADD) + PHP_EVAL_INCLINE($GDLIB_INCS) + + SEARCH_PATH="/usr/local /usr" # you might want to change this + SEARCH_FOR="/include/puzzle.h" # you most likely want to change this + if test -r $PHP_LIBPUZZLE/$SEARCH_FOR; then # path given as parameter + LIBPUZZLE_DIR=$PHP_LIBPUZZLE + else # search default 
path list + AC_MSG_CHECKING([for libpuzzle files in default path]) + for i in $SEARCH_PATH ; do + if test -r $i/$SEARCH_FOR; then + LIBPUZZLE_DIR=$i + AC_MSG_RESULT(found in $i) + fi + done + fi + + if test -z "$LIBPUZZLE_DIR"; then + AC_MSG_RESULT([not found]) + AC_MSG_ERROR([Please reinstall the libpuzzle distribution]) + fi + + dnl # --with-libpuzzle -> add include path + PHP_ADD_INCLUDE($LIBPUZZLE_DIR/include) + + PHP_ADD_LIBRARY_WITH_PATH(puzzle, $LIBPUZZLE_DIR/lib, + LIBPUZZLE_SHARED_LIBADD) + + PHP_SUBST(LIBPUZZLE_SHARED_LIBADD) + + PHP_NEW_EXTENSION(libpuzzle, libpuzzle.c, $ext_shared) +fi diff --git a/deduper/libpuzzle/php/libpuzzle/include/Makefile.am b/deduper/libpuzzle/php/libpuzzle/include/Makefile.am new file mode 100644 index 0000000..e69de29 diff --git a/deduper/libpuzzle/php/libpuzzle/libpuzzle.c b/deduper/libpuzzle/php/libpuzzle/libpuzzle.c new file mode 100644 index 0000000..82e84c3 --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/libpuzzle.c @@ -0,0 +1,410 @@ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "php.h" +#include "php_ini.h" +#include "ext/standard/info.h" +#include +#include "php_libpuzzle.h" + +ZEND_DECLARE_MODULE_GLOBALS(libpuzzle) + +/* True global resources - no need for thread safety here */ +static int le_libpuzzle; + +/* {{{ libpuzzle_functions[] + */ +zend_function_entry libpuzzle_functions[] = { + PHP_FE(puzzle_set_max_width, NULL) + PHP_FE(puzzle_set_max_height, NULL) + PHP_FE(puzzle_set_lambdas, NULL) + PHP_FE(puzzle_set_noise_cutoff, NULL) + PHP_FE(puzzle_set_p_ratio, NULL) + PHP_FE(puzzle_set_contrast_barrier_for_cropping, NULL) + PHP_FE(puzzle_set_max_cropping_ratio, NULL) + PHP_FE(puzzle_set_autocrop, NULL) + + PHP_FE(puzzle_fill_cvec_from_file, NULL) + PHP_FE(puzzle_compress_cvec, NULL) + PHP_FE(puzzle_uncompress_cvec, NULL) + PHP_FE(puzzle_vector_normalized_distance, NULL) + + {NULL, NULL, NULL} /* Must be the last line in libpuzzle_functions[] */ +}; +/* }}} */ + +/* {{{ libpuzzle_module_entry + 
*/ +zend_module_entry libpuzzle_module_entry = { +#if ZEND_MODULE_API_NO >= 20010901 + STANDARD_MODULE_HEADER, +#endif + "libpuzzle", + libpuzzle_functions, + PHP_MINIT(libpuzzle), + PHP_MSHUTDOWN(libpuzzle), + PHP_RINIT(libpuzzle), /* Replace with NULL if there's nothing to do at request start */ + PHP_RSHUTDOWN(libpuzzle), /* Replace with NULL if there's nothing to do at request end */ + PHP_MINFO(libpuzzle), +#if ZEND_MODULE_API_NO >= 20010901 + "0.10", /* Replace with version number for your extension */ +#endif + STANDARD_MODULE_PROPERTIES +}; +/* }}} */ + +#ifdef COMPILE_DL_LIBPUZZLE +ZEND_GET_MODULE(libpuzzle) +#endif + + +/* {{{ PHP_MINIT_FUNCTION + */ +PHP_MINIT_FUNCTION(libpuzzle) +{ + REGISTER_DOUBLE_CONSTANT("PUZZLE_CVEC_SIMILARITY_THRESHOLD", + PUZZLE_CVEC_SIMILARITY_THRESHOLD, + CONST_CS | CONST_PERSISTENT); + REGISTER_DOUBLE_CONSTANT("PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD", + PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD, + CONST_CS | CONST_PERSISTENT); + REGISTER_DOUBLE_CONSTANT("PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD", + PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD, + CONST_CS | CONST_PERSISTENT); + REGISTER_DOUBLE_CONSTANT("PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD", + PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD, + CONST_CS | CONST_PERSISTENT); + return SUCCESS; +} +/* }}} */ + +/* {{{ PHP_MSHUTDOWN_FUNCTION + */ +PHP_MSHUTDOWN_FUNCTION(libpuzzle) +{ + return SUCCESS; +} +/* }}} */ + +/* Remove if there's nothing to do at request start */ +/* {{{ PHP_RINIT_FUNCTION + */ +PHP_RINIT_FUNCTION(libpuzzle) +{ + puzzle_init_context(&LIBPUZZLE_G(global_context)); + return SUCCESS; +} +/* }}} */ + +/* Remove if there's nothing to do at request end */ +/* {{{ PHP_RSHUTDOWN_FUNCTION + */ +PHP_RSHUTDOWN_FUNCTION(libpuzzle) +{ + puzzle_free_context(&LIBPUZZLE_G(global_context)); + return SUCCESS; +} +/* }}} */ + +/* {{{ PHP_MINFO_FUNCTION + */ +PHP_MINFO_FUNCTION(libpuzzle) +{ + php_info_print_table_start(); + php_info_print_table_header(2, "libpuzzle support", "enabled"); + 
php_info_print_table_end(); +} +/* }}} */ + +/* {{{ proto string puzzle_fill_cvec_from_file(string filename) + * Creates a signature out of an image file */ +PHP_FUNCTION(puzzle_fill_cvec_from_file) +{ + char *arg = NULL; + int arg_len; + PuzzleContext *context; + PuzzleCvec cvec; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, + "s", &arg, &arg_len) == FAILURE || + arg_len <= 0) { + RETURN_FALSE; + } + puzzle_init_cvec(context, &cvec); + if (puzzle_fill_cvec_from_file(context, &cvec, arg) != 0) { + puzzle_free_cvec(context, &cvec); + RETURN_FALSE; + } + RETVAL_STRINGL(cvec.vec, cvec.sizeof_vec, 1); + puzzle_free_cvec(context, &cvec); +} +/* }}} */ + +/* {{{ proto string puzzle_compress_cvec(string cvec) + * Compress a signature to save storage space */ +PHP_FUNCTION(puzzle_compress_cvec) +{ + char *arg = NULL; + int arg_len; + PuzzleContext *context; + PuzzleCompressedCvec compressed_cvec; + PuzzleCvec cvec; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, + "s", &arg, &arg_len) == FAILURE || + arg_len <= 0) { + RETURN_FALSE; + } + puzzle_init_compressed_cvec(context, &compressed_cvec); + puzzle_init_cvec(context, &cvec); + cvec.vec = arg; + cvec.sizeof_vec = (size_t) arg_len; + if (puzzle_compress_cvec(context, &compressed_cvec, &cvec) != 0) { + puzzle_free_compressed_cvec(context, &compressed_cvec); + cvec.vec = NULL; + puzzle_free_cvec(context, &cvec); + RETURN_FALSE; + } + RETVAL_STRINGL(compressed_cvec.vec, + compressed_cvec.sizeof_compressed_vec, 1); + puzzle_free_compressed_cvec(context, &compressed_cvec); + cvec.vec = NULL; + puzzle_free_cvec(context, &cvec); +} +/* }}} */ + +/* {{{ proto string puzzle_uncompress_cvec(string compressed_cvec) + * Uncompress a compressed signature so that it can be used for computations */ +PHP_FUNCTION(puzzle_uncompress_cvec) +{ + char *arg = NULL; + int arg_len; + PuzzleContext *context; + PuzzleCompressedCvec 
compressed_cvec; + PuzzleCvec cvec; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, + "s", &arg, &arg_len) == FAILURE || + arg_len <= 0) { + RETURN_FALSE; + } + puzzle_init_compressed_cvec(context, &compressed_cvec); + puzzle_init_cvec(context, &cvec); + compressed_cvec.vec = arg; + compressed_cvec.sizeof_compressed_vec = (size_t) arg_len; + if (puzzle_uncompress_cvec(context, &compressed_cvec, &cvec) != 0) { + puzzle_free_cvec(context, &cvec); + compressed_cvec.vec = NULL; + puzzle_free_compressed_cvec(context, &compressed_cvec); + RETURN_FALSE; + } + RETVAL_STRINGL(cvec.vec, cvec.sizeof_vec, 1); + puzzle_free_cvec(context, &cvec); + compressed_cvec.vec = NULL; + puzzle_free_compressed_cvec(context, &compressed_cvec); +} +/* }}} */ + +/* {{{ proto double puzzle_vector_normalized_distance(string cvec1, string cvec2 [, bool fix_for_texts]) + * Computes the distance between two signatures. Result is between 0.0 and 1.0 */ +PHP_FUNCTION(puzzle_vector_normalized_distance) +{ + char *vec1 = NULL, *vec2 = NULL; + int vec1_len, vec2_len; + PuzzleContext *context; + PuzzleCvec cvec1, cvec2; + double d; + zend_bool fix_for_texts; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters + (ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", + &vec1, &vec1_len, &vec2, &vec2_len, &fix_for_texts) == FAILURE || + vec1_len <= 0 || vec2_len <= 0) { + RETURN_FALSE; + } + if (ZEND_NUM_ARGS() TSRMLS_CC < 3) { + fix_for_texts = (zend_bool) 1; + } + puzzle_init_cvec(context, &cvec1); + puzzle_init_cvec(context, &cvec2); + cvec1.vec = vec1; + cvec1.sizeof_vec = (size_t) vec1_len; + cvec2.vec = vec2; + cvec2.sizeof_vec = (size_t) vec2_len; + d = puzzle_vector_normalized_distance(context, &cvec1, &cvec2, + (int) fix_for_texts); + cvec1.vec = cvec2.vec = NULL; + puzzle_free_cvec(context, &cvec1); + puzzle_free_cvec(context, &cvec2); + RETVAL_DOUBLE(d); +} +/* }}} */ + +/* {{{ proto bool puzzle_set_max_width(int width) + * Set the maximum 
picture width */ +PHP_FUNCTION(puzzle_set_max_width) +{ + PuzzleContext *context; + long width; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters + (ZEND_NUM_ARGS() TSRMLS_CC, "l", &width) == FAILURE || + width <= 0L || width > INT_MAX) { + RETURN_FALSE; + } + if (puzzle_set_max_width(context, (unsigned int) width) != 0) { + RETURN_FALSE; + } + RETVAL_TRUE; +} +/* }}} */ + +/* {{{ proto bool puzzle_set_max_height(int height) + * Set the maximum picture height */ +PHP_FUNCTION(puzzle_set_max_height) +{ + PuzzleContext *context; + long height; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters + (ZEND_NUM_ARGS() TSRMLS_CC, "l", &height) == FAILURE || + height <= 0L || height > INT_MAX) { + RETURN_FALSE; + } + if (puzzle_set_max_height(context, (unsigned int) height) != 0) { + RETURN_FALSE; + } + RETVAL_TRUE; +} +/* }}} */ + +/* {{{ proto bool puzzle_set_lambdas(int lambdas) + * Set the size of the computation grid */ +PHP_FUNCTION(puzzle_set_lambdas) +{ + PuzzleContext *context; + long lambdas; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters + (ZEND_NUM_ARGS() TSRMLS_CC, "l", &lambdas) == FAILURE || + lambdas <= 0L || lambdas > INT_MAX) { + RETURN_FALSE; + } + if (puzzle_set_lambdas(context, (unsigned int) lambdas) != 0) { + RETURN_FALSE; + } + RETVAL_TRUE; +} +/* }}} */ + +/* {{{ proto bool puzzle_set_noise_cutoff(double cutoff) + * Set the noise cutoff level */ +PHP_FUNCTION(puzzle_set_noise_cutoff) +{ + PuzzleContext *context; + double cutoff; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters + (ZEND_NUM_ARGS() TSRMLS_CC, "d", &cutoff) == FAILURE) { + RETURN_FALSE; + } + if (puzzle_set_noise_cutoff(context, cutoff) != 0) { + RETURN_FALSE; + } + RETVAL_TRUE; +} +/* }}} */ + +/* {{{ proto bool puzzle_set_p_ratio(double ratio) + * Set the p_ratio */ +PHP_FUNCTION(puzzle_set_p_ratio) +{ + PuzzleContext *context; + double p_ratio; + + context = &LIBPUZZLE_G(global_context); + if 
(zend_parse_parameters + (ZEND_NUM_ARGS() TSRMLS_CC, "d", &p_ratio) == FAILURE) { + RETURN_FALSE; + } + if (puzzle_set_p_ratio(context, p_ratio) != 0) { + RETURN_FALSE; + } + RETVAL_TRUE; +} +/* }}} */ + +/* {{{ proto bool puzzle_set_contrast_barrier_for_cropping(double barrier) + * Set the tolerance level for cropping */ +PHP_FUNCTION(puzzle_set_contrast_barrier_for_cropping) +{ + PuzzleContext *context; + double barrier; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters + (ZEND_NUM_ARGS() TSRMLS_CC, "d", &barrier) == FAILURE) { + RETURN_FALSE; + } + if (puzzle_set_contrast_barrier_for_cropping(context, barrier) != 0) { + RETURN_FALSE; + } + RETVAL_TRUE; +} +/* }}} */ + +/* {{{ proto bool puzzle_set_max_cropping_ratio(double ratio) + * Set the maximum ratio between the cropped area and the whole picture */ +PHP_FUNCTION(puzzle_set_max_cropping_ratio) +{ + PuzzleContext *context; + double ratio; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters + (ZEND_NUM_ARGS() TSRMLS_CC, "d", &ratio) == FAILURE) { + RETURN_FALSE; + } + if (puzzle_set_max_cropping_ratio(context, ratio) != 0) { + RETURN_FALSE; + } + RETVAL_TRUE; +} +/* }}} */ + +/* {{{ proto bool puzzle_set_autocrop(bool autocrop) + * TRUE to enable autocropping, FALSE to disable */ +PHP_FUNCTION(puzzle_set_autocrop) +{ + PuzzleContext *context; + zend_bool autocrop; + + context = &LIBPUZZLE_G(global_context); + if (zend_parse_parameters + (ZEND_NUM_ARGS() TSRMLS_CC, "b", &autocrop) == FAILURE) { + RETURN_FALSE; + } + if (puzzle_set_autocrop(context, (int) autocrop) != 0) { + RETURN_FALSE; + } + RETVAL_TRUE; +} +/* }}} */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: noet sw=4 ts=4 fdm=marker + * vim<600: noet sw=4 ts=4 + */ diff --git a/deduper/libpuzzle/php/libpuzzle/libpuzzle.php b/deduper/libpuzzle/php/libpuzzle/libpuzzle.php new file mode 100644 index 0000000..415273b --- /dev/null +++ 
b/deduper/libpuzzle/php/libpuzzle/libpuzzle.php @@ -0,0 +1,21 @@ +"; + +if(!extension_loaded('libpuzzle')) { + dl('libpuzzle.' . PHP_SHLIB_SUFFIX); +} +$module = 'libpuzzle'; +$functions = get_extension_funcs($module); +echo "Functions available in the test extension:$br\n"; +foreach($functions as $func) { + echo $func."$br\n"; +} +echo "$br\n"; +$function = 'confirm_' . $module . '_compiled'; +if (extension_loaded($module)) { + $str = $function($module); +} else { + $str = "Module $module is not compiled into PHP"; +} +echo "$str\n"; +?> diff --git a/deduper/libpuzzle/php/libpuzzle/modules/Makefile.am b/deduper/libpuzzle/php/libpuzzle/modules/Makefile.am new file mode 100644 index 0000000..e69de29 diff --git a/deduper/libpuzzle/php/libpuzzle/php_libpuzzle.h b/deduper/libpuzzle/php/libpuzzle/php_libpuzzle.h new file mode 100644 index 0000000..1fae819 --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/php_libpuzzle.h @@ -0,0 +1,66 @@ +#ifndef PHP_LIBPUZZLE_H +#define PHP_LIBPUZZLE_H + +extern zend_module_entry libpuzzle_module_entry; +#define phpext_libpuzzle_ptr &libpuzzle_module_entry + +#ifdef PHP_WIN32 +#define PHP_LIBPUZZLE_API __declspec(dllexport) +#else +#define PHP_LIBPUZZLE_API +#endif + +#ifdef ZTS +#include "TSRM.h" +#endif + +PHP_MINIT_FUNCTION(libpuzzle); +PHP_MSHUTDOWN_FUNCTION(libpuzzle); +PHP_RINIT_FUNCTION(libpuzzle); +PHP_RSHUTDOWN_FUNCTION(libpuzzle); +PHP_MINFO_FUNCTION(libpuzzle); + +PHP_FUNCTION(puzzle_set_max_width); +PHP_FUNCTION(puzzle_set_max_height); +PHP_FUNCTION(puzzle_set_lambdas); +PHP_FUNCTION(puzzle_set_noise_cutoff); +PHP_FUNCTION(puzzle_set_p_ratio); +PHP_FUNCTION(puzzle_set_contrast_barrier_for_cropping); +PHP_FUNCTION(puzzle_set_max_cropping_ratio); +PHP_FUNCTION(puzzle_set_autocrop); + +PHP_FUNCTION(puzzle_fill_cvec_from_file); +PHP_FUNCTION(puzzle_compress_cvec); +PHP_FUNCTION(puzzle_uncompress_cvec); +PHP_FUNCTION(puzzle_vector_normalized_distance); + +ZEND_BEGIN_MODULE_GLOBALS(libpuzzle) + PuzzleContext global_context; 
+ZEND_END_MODULE_GLOBALS(libpuzzle) + +/* In every utility function you add that needs to use variables + in php_libpuzzle_globals, call TSRMLS_FETCH(); after declaring other + variables used by that function, or better yet, pass in TSRMLS_CC + after the last function argument and declare your utility function + with TSRMLS_DC after the last declared argument. Always refer to + the globals in your function as LIBPUZZLE_G(variable). You are + encouraged to rename these macros something shorter, see + examples in any other php module directory. +*/ + +#ifdef ZTS +#define LIBPUZZLE_G(v) TSRMG(libpuzzle_globals_id, zend_libpuzzle_globals *, v) +#else +#define LIBPUZZLE_G(v) (libpuzzle_globals.v) +#endif + +#endif /* PHP_LIBPUZZLE_H */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: noet sw=4 ts=4 fdm=marker + * vim<600: noet sw=4 ts=4 + */ diff --git a/deduper/libpuzzle/php/libpuzzle/tests/001.phpt b/deduper/libpuzzle/php/libpuzzle/tests/001.phpt new file mode 100644 index 0000000..5a5f5b5 --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/tests/001.phpt @@ -0,0 +1,10 @@ +--TEST-- +Check for libpuzzle presence +--SKIPIF-- + +--FILE-- + +--EXPECT-- +libpuzzle extension is available diff --git a/deduper/libpuzzle/php/libpuzzle/tests/002.phpt b/deduper/libpuzzle/php/libpuzzle/tests/002.phpt new file mode 100644 index 0000000..d675145 --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/tests/002.phpt @@ -0,0 +1,15 @@ +--TEST-- +Check for distance between similar images +--SKIPIF-- + +--FILE-- + +--EXPECT-- +1 diff --git a/deduper/libpuzzle/php/libpuzzle/tests/003.phpt b/deduper/libpuzzle/php/libpuzzle/tests/003.phpt new file mode 100644 index 0000000..ba7d5aa --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/tests/003.phpt @@ -0,0 +1,24 @@ +--TEST-- +Check the puzzle_set(3) interface +--SKIPIF-- + +--FILE-- + +--EXPECT-- +1 diff --git a/deduper/libpuzzle/php/libpuzzle/tests/Makefile.am 
b/deduper/libpuzzle/php/libpuzzle/tests/Makefile.am new file mode 100644 index 0000000..14ded39 --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/tests/Makefile.am @@ -0,0 +1,7 @@ +EXTRA_DIST = \ + 001.phpt \ + 002.phpt \ + 003.phpt + +SUBDIRS = \ + pics diff --git a/deduper/libpuzzle/php/libpuzzle/tests/pics/Makefile.am b/deduper/libpuzzle/php/libpuzzle/tests/pics/Makefile.am new file mode 100644 index 0000000..0aacd9a --- /dev/null +++ b/deduper/libpuzzle/php/libpuzzle/tests/pics/Makefile.am @@ -0,0 +1,3 @@ +EXTRA_DIST = \ + pic-a-0.jpg \ + pic-a-1.jpg diff --git a/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-0.jpg b/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-0.jpg new file mode 100644 index 0000000..3dd4a3b Binary files /dev/null and b/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-0.jpg differ diff --git a/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-1.jpg b/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-1.jpg new file mode 100644 index 0000000..95f0e77 Binary files /dev/null and b/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-1.jpg differ diff --git a/deduper/libpuzzle/src/CMakeLists.txt b/deduper/libpuzzle/src/CMakeLists.txt new file mode 100644 index 0000000..634ef38 --- /dev/null +++ b/deduper/libpuzzle/src/CMakeLists.txt @@ -0,0 +1,21 @@ +project(puzzle C) + +include(FindPkgConfig) +pkg_search_module(gdlib REQUIRED gdlib) + +add_library(puzzle STATIC + globals.h + puzzle_common.h + puzzle_p.h + puzzle.h + compress.c + cvec.c + dvec.c + puzzle.c + tunables.c + vector_ops.c +) +target_include_directories(puzzle + PRIVATE + ${gdlib_INCLUDE_DIRS} +) diff --git a/deduper/libpuzzle/src/Makefile.am b/deduper/libpuzzle/src/Makefile.am new file mode 100644 index 0000000..3016925 --- /dev/null +++ b/deduper/libpuzzle/src/Makefile.am @@ -0,0 +1,72 @@ +lib_LTLIBRARIES = \ + libpuzzle.la + +libpuzzle_la_LDFLAGS = -version-info 1:0 + +libpuzzle_la_SOURCES = \ + puzzle.c \ + tunables.c \ + dvec.c \ + cvec.c \ + compress.c \ + vector_ops.c \ + 
puzzle_common.h \ + puzzle_p.h \ + globals.h \ + puzzle.h + +include_HEADERS = \ + puzzle.h + +noinst_HEADERS = \ + puzzle_common.h \ + puzzle_p.h \ + globals.h + +bin_PROGRAMS = \ + puzzle-diff + +puzzle_diff_SOURCES = \ + puzzle-diff.c \ + puzzle_common.h \ + puzzle.h + +puzzle_diff_LDADD = \ + libpuzzle.la + +TESTS = \ + regress_1 \ + regress_2 \ + regress_3 + +check_PROGRAMS = \ + regress_1 \ + regress_2 \ + regress_3 + +regress_1_SOURCES = \ + regress_1.c \ + puzzle_common.h \ + puzzle.h + +regress_2_SOURCES = \ + regress_2.c \ + puzzle_common.h \ + puzzle.h + +regress_3_SOURCES = \ + regress_3.c \ + puzzle_common.h \ + puzzle.h + +regress_1_LDADD = \ + libpuzzle.la + +regress_2_LDADD = \ + libpuzzle.la + +regress_3_LDADD = \ + libpuzzle.la + +SUBDIRS = \ + pics diff --git a/deduper/libpuzzle/src/compress.c b/deduper/libpuzzle/src/compress.c new file mode 100644 index 0000000..e71da95 --- /dev/null +++ b/deduper/libpuzzle/src/compress.c @@ -0,0 +1,125 @@ +#include "puzzle_common.h" +#include "puzzle_p.h" +#include "puzzle.h" +#include "globals.h" + +void puzzle_init_compressed_cvec(PuzzleContext * const context, + PuzzleCompressedCvec * const compressed_cvec) +{ + (void) context; + compressed_cvec->sizeof_compressed_vec = (size_t) 0U; + compressed_cvec->vec = NULL; +} + +void puzzle_free_compressed_cvec(PuzzleContext * const context, + PuzzleCompressedCvec * const compressed_cvec) +{ + (void) context; + free(compressed_cvec->vec); + compressed_cvec->vec = NULL; +} + +int puzzle_compress_cvec(PuzzleContext * const context, + PuzzleCompressedCvec * const compressed_cvec, + const PuzzleCvec * const cvec) +{ +#define PC_NM(X) ((unsigned char) ((X) + 2)) + size_t remaining = cvec->sizeof_vec; + const signed char *ptr; + unsigned char *cptr; + + (void) context; + compressed_cvec->sizeof_compressed_vec = + (cvec->sizeof_vec + (size_t) 2U) / (size_t) 3U; + if ((compressed_cvec->vec = + calloc(compressed_cvec->sizeof_compressed_vec, + sizeof *compressed_cvec->vec)) == 
NULL) { + return -1; + } + ptr = cvec->vec; + cptr = compressed_cvec->vec; + while (remaining >= (size_t) 3U) { + *cptr++ = PC_NM(ptr[0]) + PC_NM(ptr[1]) * 5U + + PC_NM(ptr[2]) * (5U * 5U); + ptr += 3U; + remaining -= 3U; + } + if (remaining == (size_t) 1U) { + *cptr++ = PC_NM(ptr[0]); + compressed_cvec->vec[0] |= 128U; + } else if (remaining == (size_t) 2U) { + *cptr++ = PC_NM(ptr[0]) + PC_NM(ptr[1]) * 5U; + if (compressed_cvec->sizeof_compressed_vec < (size_t) 2U) { + puzzle_err_bug(__FILE__, __LINE__); + } + compressed_cvec->vec[1] |= 128U; + } + if ((size_t) (cptr - compressed_cvec->vec) != + compressed_cvec->sizeof_compressed_vec) { + puzzle_err_bug(__FILE__, __LINE__); + } + return 0; +} + +int puzzle_uncompress_cvec(PuzzleContext * const context, + const PuzzleCompressedCvec * const compressed_cvec, + PuzzleCvec * const cvec) +{ +#define PC_FL(X) ((X) & 127U) +#define PC_NP(X) ((signed char) (X) - 2) + + size_t remaining; + unsigned char trailing_bits; + const unsigned char *cptr = compressed_cvec->vec; + signed char *ptr; + unsigned char c; + + (void) context; + if (cvec->vec != NULL) { + puzzle_err_bug(__FILE__, __LINE__); + } + if ((remaining = compressed_cvec->sizeof_compressed_vec) < (size_t) 2U) { + puzzle_err_bug(__FILE__, __LINE__); + } + trailing_bits = ((cptr[0] & 128U) >> 7) | ((cptr[1] & 128U) >> 6); + if (trailing_bits > 2U) { + puzzle_err_bug(__FILE__, __LINE__); + } + cvec->sizeof_vec = (size_t) 3U * + (compressed_cvec->sizeof_compressed_vec - trailing_bits) + + trailing_bits; + if (compressed_cvec->sizeof_compressed_vec > + SIZE_MAX / (size_t) 3U - (size_t) 2U) { + puzzle_err_bug(__FILE__, __LINE__); + } + if ((cvec->vec = calloc(cvec->sizeof_vec, sizeof *cvec->vec)) == NULL) { + return -1; + } + if (trailing_bits != 0U) { + if (remaining <= (size_t) 0U) { + puzzle_err_bug(__FILE__, __LINE__); + } + remaining--; + } + ptr = cvec->vec; + while (remaining > (size_t) 0U) { + c = PC_FL(*cptr++); + *ptr++ = PC_NP(c % 5U); + c /= 5U; + *ptr++ = 
PC_NP(c % 5U); + c /= 5U; + *ptr++ = PC_NP(c % 5U); + remaining--; + } + if (trailing_bits == 1U) { + *ptr++ = PC_NP(PC_FL(*cptr) % 5U); + } else if (trailing_bits == 2U) { + c = PC_FL(*cptr); + *ptr++ = PC_NP(c % 5U); + *ptr++ = PC_NP(c / 5U % 5U); + } + if ((size_t) (ptr - cvec->vec) != cvec->sizeof_vec) { + puzzle_err_bug(__FILE__, __LINE__); + } + return 0; +} diff --git a/deduper/libpuzzle/src/cvec.c b/deduper/libpuzzle/src/cvec.c new file mode 100644 index 0000000..482b445 --- /dev/null +++ b/deduper/libpuzzle/src/cvec.c @@ -0,0 +1,202 @@ +#include "puzzle_common.h" +#include "puzzle_p.h" +#include "puzzle.h" +#include "globals.h" + +static int puzzle_median_cmp(const void * const a_, const void * const b_) +{ + const double a = * (const double *) a_; + const double b = * (const double *) b_; + + if (a < b) { + return -1; + } else if (a > b) { + return 1; + } + return 0; +} + +static double puzzle_median(double * const vec, size_t size) +{ + size_t n; + size_t o; + double avg; + + if (size <= (size_t) 0U) { + return 0.0; + } + qsort((void *) vec, size, sizeof *vec, puzzle_median_cmp); + if ((n = size / (size_t) 2U) == (size_t) 0U) { + if (size > (size_t) 1U) { + o = (size_t) 1U; + } else { + o = (size_t) 0U; + } + } else { + o = n + (size_t) 1U; + } + if (o < n) { + puzzle_err_bug(__FILE__, __LINE__); + } + avg = (vec[n] + vec[o]) / 2.0; + if (avg < vec[n] || avg > vec[o]) { + avg = vec[n]; + } + return avg; +} + +int puzzle_fill_cvec_from_dvec(PuzzleContext * const context, + PuzzleCvec * const cvec, + const PuzzleDvec * const dvec) +{ + size_t s; + const double *dvecptr; + signed char *cvecptr; + double *lights = NULL, *darks = NULL; + size_t pos_lights = (size_t) 0U, pos_darks = (size_t) 0U; + size_t sizeof_lights, sizeof_darks; + double lighter_cutoff, darker_cutoff; + int err = 0; + double dv; + + if ((cvec->sizeof_vec = dvec->sizeof_compressed_vec) <= (size_t) 0U) { + puzzle_err_bug(__FILE__, __LINE__); + } + if ((cvec->vec = calloc(cvec->sizeof_vec, 
sizeof *cvec->vec)) == NULL) { + return -1; + } + sizeof_lights = sizeof_darks = cvec->sizeof_vec; + if ((lights = calloc(sizeof_lights, sizeof *lights)) == NULL || + (darks = calloc(sizeof_darks, sizeof *darks)) == NULL) { + err = -1; + goto out; + } + dvecptr = dvec->vec; + s = cvec->sizeof_vec; + do { + dv = *dvecptr++; + if (dv >= - context->puzzle_noise_cutoff && + dv <= context->puzzle_noise_cutoff) { + continue; + } + if (dv < context->puzzle_noise_cutoff) { + darks[pos_darks++] = dv; + if (pos_darks > sizeof_darks) { + puzzle_err_bug(__FILE__, __LINE__); + } + } else if (dv > context->puzzle_noise_cutoff) { + lights[pos_lights++] = dv; + if (pos_lights > sizeof_lights) { + puzzle_err_bug(__FILE__, __LINE__); + } + } + } while (--s != (size_t) 0U); + lighter_cutoff = puzzle_median(lights, pos_lights); + darker_cutoff = puzzle_median(darks, pos_darks); + free(lights); + lights = NULL; + free(darks); + darks = NULL; + dvecptr = dvec->vec; + cvecptr = cvec->vec; + s = cvec->sizeof_vec; + do { + dv = *dvecptr++; + if (dv >= - context->puzzle_noise_cutoff && + dv <= context->puzzle_noise_cutoff) { + *cvecptr++ = 0; + } else if (dv < 0.0) { + *cvecptr++ = dv < darker_cutoff ? -2 : -1; + } else { + *cvecptr++ = dv > lighter_cutoff ? 
+2 : +1; + } + } while (--s != (size_t) 0U); + if ((size_t) (cvecptr - cvec->vec) != cvec->sizeof_vec) { + puzzle_err_bug(__FILE__, __LINE__); + } + out: + free(lights); + free(darks); + + return err; +} + +void puzzle_init_cvec(PuzzleContext * const context, PuzzleCvec * const cvec) +{ + (void) context; + cvec->sizeof_vec = (size_t) 0U; + cvec->vec = NULL; +} + +void puzzle_free_cvec(PuzzleContext * const context, PuzzleCvec * const cvec) +{ + (void) context; + free(cvec->vec); + cvec->vec = NULL; +} + +int puzzle_dump_cvec(PuzzleContext * const context, + const PuzzleCvec * const cvec) +{ + size_t s = cvec->sizeof_vec; + const signed char *vecptr = cvec->vec; + + (void) context; + if (s <= (size_t) 0U) { + puzzle_err_bug(__FILE__, __LINE__); + } + do { + printf("%d\n", *vecptr++); + } while (--s != (size_t) 0U); + + return 0; +} + +int puzzle_cvec_cksum(PuzzleContext * const context, + const PuzzleCvec * const cvec, unsigned int * const sum) +{ + size_t s = cvec->sizeof_vec; + const signed char *vecptr = cvec->vec; + + (void) context; + *sum = 5381; + do { + *sum += *sum << 5; + *sum ^= (unsigned int) *vecptr++; + } while (--s != (size_t) 0U); + + return 0; +} + +int puzzle_fill_cvec_from_file(PuzzleContext * const context, + PuzzleCvec * const cvec, + const char * const file) +{ + PuzzleDvec dvec; + int ret; + + puzzle_init_dvec(context, &dvec); + if ((ret = puzzle_fill_dvec_from_file(context, &dvec, file)) == 0) { + ret = puzzle_fill_cvec_from_dvec(context, cvec, &dvec); + } + puzzle_free_dvec(context, &dvec); + + return ret; +} + +int puzzle_fill_cvec_from_mem(PuzzleContext * const context, + PuzzleCvec * const cvec, + const void * const mem, + const size_t size) +{ + PuzzleDvec dvec; + int ret; + + puzzle_init_dvec(context, &dvec); + if ((ret = puzzle_fill_dvec_from_mem(context, &dvec, mem, size)) == 0) { + ret = puzzle_fill_cvec_from_dvec(context, cvec, &dvec); + } + puzzle_free_dvec(context, &dvec); + + return ret; +} diff --git 
a/deduper/libpuzzle/src/dvec.c b/deduper/libpuzzle/src/dvec.c new file mode 100644 index 0000000..f5d21f9 --- /dev/null +++ b/deduper/libpuzzle/src/dvec.c @@ -0,0 +1,663 @@ +#include "puzzle_common.h" +#include "puzzle_p.h" +#include "puzzle.h" +#include "globals.h" + +static void puzzle_init_view(PuzzleView * const view) +{ + view->width = view->height = 0U; + view->sizeof_map = (size_t) 0U; + view->map = NULL; +} + +static void puzzle_free_view(PuzzleView * const view) +{ + free(view->map); + view->map = NULL; +} + +static void puzzle_init_avglvls(PuzzleAvgLvls * const avglvls) +{ + avglvls->lambdas = 0U; + avglvls->sizeof_lvls = (size_t) 0U; + avglvls->lvls = NULL; +} + +static void puzzle_free_avglvls(PuzzleAvgLvls * const avglvls) +{ + free(avglvls->lvls); + avglvls->lvls = NULL; +} + +void puzzle_init_dvec(PuzzleContext * const context, PuzzleDvec * const dvec) +{ + (void) context; + dvec->sizeof_vec = dvec->sizeof_compressed_vec = (size_t) 0U; + dvec->vec = NULL; +} + +void puzzle_free_dvec(PuzzleContext * const context, PuzzleDvec * const dvec) +{ + (void) context; + free(dvec->vec); + dvec->vec = NULL; +} + +#define MAX_SIGNATURE_LENGTH 8U + +static PuzzleImageTypeCode puzzle_get_image_type_from_header(const unsigned char * const header) +{ + static const PuzzleImageType image_types[] = { + { (size_t) 4U, (const unsigned char *) + "GIF8", PUZZLE_IMAGE_TYPE_GIF }, + { (size_t) 3U, (const unsigned char *) + "\xff\xd8\xff", PUZZLE_IMAGE_TYPE_JPEG }, + { (size_t) 8U, (const unsigned char *) + "\x89PNG\r\n\x1a\n", PUZZLE_IMAGE_TYPE_PNG }, + { (size_t) 0U, NULL, PUZZLE_IMAGE_TYPE_UNKNOWN } + }; + const PuzzleImageType *image_type = image_types; + PuzzleImageTypeCode ret = PUZZLE_IMAGE_TYPE_UNKNOWN; + do { + if (image_type->sizeof_signature > MAX_SIGNATURE_LENGTH) { + puzzle_err_bug(__FILE__, __LINE__); + } + if (memcmp(header, image_type->signature, + image_type->sizeof_signature) == 0) { + ret = image_type->image_type_code; + break; + } + image_type++; + } 
while (image_type->signature != NULL); + return ret; +} + +static PuzzleImageTypeCode puzzle_get_image_type_from_fp(FILE * const fp) +{ + unsigned char header[MAX_SIGNATURE_LENGTH]; + PuzzleImageTypeCode ret = PUZZLE_IMAGE_TYPE_ERROR; + fpos_t pos; + + if (fgetpos(fp, &pos) != 0) { + return PUZZLE_IMAGE_TYPE_ERROR; + } + rewind(fp); + if (fread(header, (size_t) 1U, sizeof header, fp) != sizeof header) { + goto bye; + } + ret = puzzle_get_image_type_from_header(header); + bye: + if (fsetpos(fp, &pos) != 0) { + puzzle_err_bug(__FILE__, __LINE__); + } + return ret; +} + +static int puzzle_autocrop_axis(PuzzleContext * const context, + PuzzleView * const view, + unsigned int * const crop0, + unsigned int * const crop1, + const unsigned int axisn, + const unsigned int axiso, + const int omaptrinc, const int nmaptrinc) +{ + double *chunk_contrasts; + size_t sizeof_chunk_contrasts; + double chunk_contrast = 0.0, total_contrast = 0.0, barrier_contrast; + unsigned char level = 0U; + unsigned char previous_level = 0U; + unsigned int chunk_n, chunk_o; + unsigned int chunk_n1, chunk_o1; + unsigned int max_crop; + const unsigned char *maptr; + + chunk_n1 = axisn - 1U; + chunk_o1 = axiso - 1U; + *crop0 = 0U; + *crop1 = chunk_n1; + if (axisn < (unsigned int) PUZZLE_MIN_SIZE_FOR_CROPPING || + axiso < (unsigned int) PUZZLE_MIN_SIZE_FOR_CROPPING) { + return 1; + } + sizeof_chunk_contrasts = chunk_n1 + 1U; + if ((chunk_contrasts = calloc(sizeof_chunk_contrasts, + sizeof *chunk_contrasts)) == NULL) { + return -1; + } + maptr = view->map; + if (axisn >= INT_MAX || axiso >= INT_MAX) { + puzzle_err_bug(__FILE__, __LINE__); + } + if (INT_MAX / axisn < axiso) { + puzzle_err_bug(__FILE__, __LINE__); + } + chunk_n = chunk_n1; + do { + chunk_contrast = 0.0; + chunk_o = chunk_o1; + previous_level = *maptr; + do { + level = *maptr; + if (previous_level > level) { + chunk_contrast += (double) (previous_level - level); + } else { + chunk_contrast += (double) (level - previous_level); + } + 
previous_level = level; + maptr += omaptrinc; + } while (chunk_o-- != 0U); + chunk_contrasts[chunk_n] = chunk_contrast; + total_contrast += chunk_contrast; + maptr += nmaptrinc; + } while (chunk_n-- != 0U); + barrier_contrast = + total_contrast * context->puzzle_contrast_barrier_for_cropping; + total_contrast = 0.0; + *crop0 = 0U; + do { + total_contrast += chunk_contrasts[*crop0]; + if (total_contrast >= barrier_contrast) { + break; + } + } while ((*crop0)++ < chunk_n1); + total_contrast = 0.0; + *crop1 = chunk_n1; + do { + total_contrast += chunk_contrasts[*crop1]; + if (total_contrast >= barrier_contrast) { + break; + } + } while ((*crop1)-- > 0U); + free(chunk_contrasts); + if (*crop0 > chunk_n1 || *crop1 > chunk_n1) { + puzzle_err_bug(__FILE__, __LINE__); + } + max_crop = (unsigned int) + round((double) chunk_n1 * context->puzzle_max_cropping_ratio); + if (max_crop > chunk_n1) { + puzzle_err_bug(__FILE__, __LINE__); + } + *crop0 = MIN(*crop0, max_crop); + *crop1 = MAX(*crop1, chunk_n1 - max_crop); + + return 0; +} + +static int puzzle_autocrop_view(PuzzleContext * context, + PuzzleView * const view) +{ + unsigned int cropx0, cropx1; + unsigned int cropy0, cropy1; + unsigned int x, y; + unsigned char *maptr; + + if (puzzle_autocrop_axis(context, view, &cropx0, &cropx1, + view->width, view->height, + (int) view->width, + 1 - (int) (view->width * view->height)) < 0 || + puzzle_autocrop_axis(context, view, &cropy0, &cropy1, + view->height, view->width, + 1, 0) < 0) { + return -1; + } + if (cropx0 > cropx1 || cropy0 > cropy1) { + puzzle_err_bug(__FILE__, __LINE__); + } + maptr = view->map; + y = cropy0; + do { + x = cropx0; + do { + *maptr++ = PUZZLE_VIEW_PIXEL(view, x, y); + } while (x++ != cropx1); + } while (y++ != cropy1); + view->width = cropx1 - cropx0 + 1U; + view->height = cropy1 - cropy0 + 1U; + view->sizeof_map = (size_t) view->width * (size_t) view->height; + if (view->width <= 0U || view->height <= 0U || + SIZE_MAX / view->width < view->height) { + 
puzzle_err_bug(__FILE__, __LINE__); + } + return 0; +} + +static int puzzle_getview_from_gdimage(PuzzleContext * const context, + PuzzleView * const view, + gdImagePtr gdimage) +{ + unsigned int x, y; + const unsigned int x0 = 0U, y0 = 0U; + unsigned int x1, y1; + unsigned char *maptr; + int pixel; + + view->map = NULL; + view->width = (unsigned int) gdImageSX(gdimage); + view->height = (unsigned int) gdImageSY(gdimage); + view->sizeof_map = (size_t) (view->width * view->height); + if (view->width > context->puzzle_max_width || + view->height > context->puzzle_max_height) { + return -1; + } + if (view->sizeof_map <= (size_t) 0U || + INT_MAX / view->width < view->height || + SIZE_MAX / view->width < view->height || + (unsigned int) view->sizeof_map != view->sizeof_map) { + puzzle_err_bug(__FILE__, __LINE__); + } + x1 = view->width - 1U; + y1 = view->height - 1U; + if (view->width <= 0U || view->height <= 0U) { + puzzle_err_bug(__FILE__, __LINE__); + } + if ((view->map = calloc(view->sizeof_map, sizeof *view->map)) == NULL) { + return -1; + } + if (x1 > INT_MAX || y1 > INT_MAX) { /* GD uses "int" for coordinates */ + puzzle_err_bug(__FILE__, __LINE__); + } + maptr = view->map; + x = x1; + if (gdImageTrueColor(gdimage) != 0) { + do { + y = y1; + do { + pixel = gdImageGetTrueColorPixel(gdimage, (int) x, (int) y); + *maptr++ = (unsigned char) + ((gdTrueColorGetRed(pixel) * 77 + + gdTrueColorGetGreen(pixel) * 151 + + gdTrueColorGetBlue(pixel) * 28 + 128) / 256); + } while (y-- != y0); + } while (x-- != x0); + } else { + do { + y = y1; + do { + pixel = gdImagePalettePixel(gdimage, x, y); + *maptr++ = (unsigned char) + ((gdimage->red[pixel] * 77 + + gdimage->green[pixel] * 151 + + gdimage->blue[pixel] * 28 + 128) / 256); + } while (y-- != y0); + } while (x-- != x0); + } + return 0; +} + +static double puzzle_softedgedlvl(const PuzzleView * const view, + const unsigned int x, const unsigned int y) +{ + unsigned int lvl = 0U; + unsigned int ax, ay; + unsigned int count = 0U; 
+ const unsigned int xlimit = x + PUZZLE_PIXEL_FUZZ_SIZE; + const unsigned int ylimit = y + PUZZLE_PIXEL_FUZZ_SIZE; + if (x >= view->width || y >= view->height || xlimit <= x || ylimit <= y) { + puzzle_err_bug(__FILE__, __LINE__); + } + if (x > PUZZLE_PIXEL_FUZZ_SIZE) { + ax = x - PUZZLE_PIXEL_FUZZ_SIZE; + } else { + ax = 0U; + } + do { + if (ax >= view->width) { + break; + } + if (y > PUZZLE_PIXEL_FUZZ_SIZE) { + ay = y - PUZZLE_PIXEL_FUZZ_SIZE; + } else { + ay = 0U; + } + do { + if (ay >= view->height) { + break; + } + count++; + lvl += (unsigned int) PUZZLE_VIEW_PIXEL(view, ax, ay); + } while (ay++ < ylimit); + } while (ax++ < xlimit); + if (count <= 0U) { + return 0.0; + } + return (double) lvl / (double) count; +} + +static double puzzle_get_avglvl(const PuzzleView * const view, + const unsigned int x, const unsigned int y, + const unsigned int width, + const unsigned int height) +{ + double lvl = 0.0; + const unsigned int xlimit = x + width - 1U; + const unsigned int ylimit = y + height - 1U; + unsigned int ax, ay; + + if (width <= 0U || height <= 0U) { + puzzle_err_bug(__FILE__, __LINE__); + } + if (xlimit < x || ylimit < y) { + puzzle_err_bug(__FILE__, __LINE__); + } + ax = x; + do { + if (ax >= view->width) { + puzzle_err_bug(__FILE__, __LINE__); + } + ay = y; + do { + if (ay >= view->height) { + puzzle_err_bug(__FILE__, __LINE__); + } + lvl += puzzle_softedgedlvl(view, ax, ay); + } while (ay++ < ylimit); + } while (ax++ < xlimit); + + return lvl / (double) (width * height); +} + +static int puzzle_fill_avglgls(PuzzleContext * const context, + PuzzleAvgLvls * const avglvls, + const PuzzleView * const view, + const unsigned int lambdas) +{ + double width = (double) view->width; + double height = (double) view->height; + double xshift, yshift; + double x, y; + unsigned int p; + unsigned int lx, ly; + unsigned int xd, yd; + unsigned int px, py; + unsigned int lwidth, lheight; + double avglvl; + + avglvls->lambdas = lambdas; + avglvls->sizeof_lvls = (size_t) 
lambdas * lambdas; + if (UINT_MAX / lambdas < lambdas || + (unsigned int) avglvls->sizeof_lvls != avglvls->sizeof_lvls) { + puzzle_err_bug(__FILE__, __LINE__); + } + if ((avglvls->lvls = calloc(avglvls->sizeof_lvls, + sizeof *avglvls->lvls)) == NULL) { + return -1; + } + xshift = (width - + (width * (double) lambdas / (double) SUCC(lambdas))) / 2.0; + yshift = (height - + (height * (double) lambdas / (double) SUCC(lambdas))) / 2.0; + p = (unsigned int) round(MIN(width, height) / + (SUCC(lambdas) * context->puzzle_p_ratio)); + if (p < PUZZLE_MIN_P) { + p = PUZZLE_MIN_P; + } + lx = 0U; + do { + ly = 0U; + do { + x = xshift + (double) lx * PRED(width) / SUCC(lambdas); + y = yshift + (double) ly * PRED(height) / SUCC(lambdas); + lwidth = (unsigned int) round + (xshift + (double) SUCC(lx) * PRED(width) / + (double) SUCC(lambdas) - x); + lheight = (unsigned int) round + (yshift + (double) SUCC(ly) * PRED(height) / + (double) SUCC(lambdas) - y); + if (p < lwidth) { + xd = (unsigned int) round(x + (lwidth - p) / 2.0); + } else { + xd = (unsigned int) round(x); + } + if (p < lheight) { + yd = (unsigned int) round(y + (lheight - p) / 2.0); + } else { + yd = (unsigned int) round(y); + } + if (view->width - xd < p) { + px = 1U; + } else { + px = p; + } + if (view->height - yd < p) { + py = 1U; + } else { + py = p; + } + if (px > 0U && py > 0U) { + avglvl = puzzle_get_avglvl(view, xd, yd, px, py); + } else { + avglvl = 0.0; + } + PUZZLE_AVGLVL(avglvls, lx, ly) = avglvl; + } while (++ly < lambdas); + } while (++lx < lambdas); + + return 0; +} + +static unsigned int puzzle_add_neighbors(double ** const vecur, + const unsigned int max_neighbors, + const PuzzleAvgLvls * const avglvls, + const unsigned int lx, + const unsigned int ly) +{ + unsigned int ax, ay; + unsigned int xlimit, ylimit; + unsigned int neighbors = 0U; + const double ref = PUZZLE_AVGLVL(avglvls, lx, ly); + + if (max_neighbors != 8U) { + puzzle_err_bug(__FILE__, __LINE__); + } + if (lx >= avglvls->lambdas - 1U) { + 
xlimit = avglvls->lambdas - 1U; + } else { + xlimit = lx + 1U; + } + if (ly >= avglvls->lambdas - 1U) { + ylimit = avglvls->lambdas - 1U; + } else { + ylimit = ly + 1U; + } + if (lx <= 0U) { + ax = 0U; + } else { + ax = lx - 1U; + } + do { + if (ly <= 0U) { + ay = 0U; + } else { + ay = ly - 1U; + } + do { + if (ax == lx && ay == ly) { + continue; + } + *(*vecur)++ = ref - PUZZLE_AVGLVL(avglvls, ax, ay); + neighbors++; + if (neighbors <= 0U) { + puzzle_err_bug(__FILE__, __LINE__); + } + } while (ay++ < ylimit); + } while (ax++ < xlimit); + if (neighbors > max_neighbors) { + puzzle_err_bug(__FILE__, __LINE__); + } + return neighbors; +} + +static int puzzle_fill_dvec(PuzzleDvec * const dvec, + const PuzzleAvgLvls * const avglvls) +{ + unsigned int lambdas; + unsigned int lx, ly; + double *vecur; + + lambdas = avglvls->lambdas; + dvec->sizeof_compressed_vec = (size_t) 0U; + dvec->sizeof_vec = (size_t) (lambdas * lambdas * PUZZLE_NEIGHBORS); + if (SIZE_MAX / + ((size_t) (lambdas * lambdas)) < (size_t) PUZZLE_NEIGHBORS || + (unsigned int) dvec->sizeof_vec != dvec->sizeof_vec) { + puzzle_err_bug(__FILE__, __LINE__); + } + if ((dvec->vec = calloc(dvec->sizeof_vec, sizeof *dvec->vec)) == NULL) { + return -1; + } + vecur = dvec->vec; + lx = 0U; + do { + ly = 0U; + do { + (void) puzzle_add_neighbors(&vecur, PUZZLE_NEIGHBORS, + avglvls, lx, ly); + } while (++ly < lambdas); + } while (++lx < lambdas); + dvec->sizeof_compressed_vec = (size_t) (vecur - dvec->vec); + + return 0; +} + +static void puzzle_remove_transparency(gdImagePtr gdimage) +{ + int background = gdTrueColor(255, 255, 255); + int x, y, cpix; + + gdImagePaletteToTrueColor(gdimage); + + for (y = 0; y < gdImageSY(gdimage); y++) { + for (x = 0; x < gdImageSX(gdimage); x++) { + cpix = gdImageGetTrueColorPixel(gdimage, x, y); + gdImageSetPixel(gdimage, x, y, gdAlphaBlend(background, cpix)); + } + } +} + +static gdImagePtr puzzle_create_gdimage_from_file(const char * const file) +{ + gdImagePtr gdimage = NULL; + FILE 
*fp; + PuzzleImageTypeCode image_type_code; + if ((fp = fopen(file, "rb")) == NULL) { + return NULL; + } + image_type_code = puzzle_get_image_type_from_fp(fp); + switch (image_type_code) { + case PUZZLE_IMAGE_TYPE_JPEG: + gdimage = gdImageCreateFromJpeg(fp); + break; + case PUZZLE_IMAGE_TYPE_PNG: + gdimage = gdImageCreateFromPng(fp); + break; + case PUZZLE_IMAGE_TYPE_GIF: + gdimage = gdImageCreateFromGif(fp); + break; + default: + gdimage = NULL; + } + (void) fclose(fp); + return gdimage; +} + +static gdImagePtr puzzle_create_gdimage_from_mem(const void * const mem, const size_t size) +{ + gdImagePtr gdimage = NULL; + PuzzleImageTypeCode image_type_code = puzzle_get_image_type_from_header(mem); + switch (image_type_code) { + case PUZZLE_IMAGE_TYPE_JPEG: + gdimage = gdImageCreateFromJpegPtr(size, (void *)mem); + break; + case PUZZLE_IMAGE_TYPE_PNG: + gdimage = gdImageCreateFromPngPtr(size, (void *)mem); + break; + case PUZZLE_IMAGE_TYPE_GIF: + gdimage = gdImageCreateFromGifPtr(size, (void *)mem); + break; + default: + gdimage = NULL; + } + return gdimage; +} + +static int puzzle_fill_dvec_from_gdimage(PuzzleContext * const context, + PuzzleDvec * const dvec, + const gdImagePtr gdimage) +{ + PuzzleView view; + PuzzleAvgLvls avglvls; + int ret = 0; + + if (context->magic != PUZZLE_CONTEXT_MAGIC) { + puzzle_err_bug(__FILE__, __LINE__); + } + puzzle_init_view(&view); + puzzle_init_avglvls(&avglvls); + puzzle_init_dvec(context, dvec); + ret = puzzle_getview_from_gdimage(context, &view, gdimage); + if (ret != 0) { + goto out; + } + if (context->puzzle_enable_autocrop != 0 && + (ret = puzzle_autocrop_view(context, &view)) < 0) { + goto out; + } + if ((ret = puzzle_fill_avglgls(context, &avglvls, + &view, context->puzzle_lambdas)) != 0) { + goto out; + } + ret = puzzle_fill_dvec(dvec, &avglvls); + out: + puzzle_free_view(&view); + puzzle_free_avglvls(&avglvls); + + return ret; +} + +int puzzle_fill_dvec_from_file(PuzzleContext * const context, + PuzzleDvec * const dvec, + 
const char * const file) +{ + int ret; + gdImagePtr gdimage = puzzle_create_gdimage_from_file(file); + if (gdimage == NULL) { + return -1; + } + puzzle_remove_transparency(gdimage); + ret = puzzle_fill_dvec_from_gdimage(context, dvec, gdimage); + gdImageDestroy(gdimage); + return ret; +} + +int puzzle_fill_dvec_from_mem(PuzzleContext * const context, + PuzzleDvec * const dvec, + const void * const mem, + const size_t size) +{ + int ret; + gdImagePtr gdimage = puzzle_create_gdimage_from_mem(mem, size); + if (gdimage == NULL) { + return -1; + } + puzzle_remove_transparency(gdimage); + ret = puzzle_fill_dvec_from_gdimage(context, dvec, gdimage); + gdImageDestroy(gdimage); + return ret; +} + +int puzzle_dump_dvec(PuzzleContext * const context, + const PuzzleDvec * const dvec) +{ + size_t s = dvec->sizeof_compressed_vec; + const double *vecptr = dvec->vec; + + (void) context; + if (s <= (size_t) 0U) { + puzzle_err_bug(__FILE__, __LINE__); + } + do { + printf("%g\n", *vecptr++); + } while (--s != (size_t) 0U); + + return 0; +} diff --git a/deduper/libpuzzle/src/globals.h b/deduper/libpuzzle/src/globals.h new file mode 100644 index 0000000..757c5c7 --- /dev/null +++ b/deduper/libpuzzle/src/globals.h @@ -0,0 +1,26 @@ +#ifndef __GLOBALS_H__ +#define __GLOBALS_H__ 1 + +#ifdef DEFINE_GLOBALS +# define GLOBAL0(A) A +# define GLOBAL(A, B) A = B +#else +# define GLOBAL0(A) extern A +# define GLOBAL(A, B) extern A +#endif + +GLOBAL(PuzzleContext puzzle_global_context, +{ + /* unsigned int puzzle_max_width */ PUZZLE_DEFAULT_MAX_WIDTH _COMA_ + /* unsigned int puzzle_max_height */ PUZZLE_DEFAULT_MAX_HEIGHT _COMA_ + /* unsigned int puzzle_lambdas */ PUZZLE_DEFAULT_LAMBDAS _COMA_ + /* double puzzle_p_ratio */ PUZZLE_DEFAULT_P_RATIO _COMA_ + /* double puzzle_noise_cutoff */ PUZZLE_DEFAULT_NOISE_CUTOFF _COMA_ + /* double puzzle_contrast_barrier_for_cropping */ + PUZZLE_DEFAULT_CONTRAST_BARRIER_FOR_CROPPING _COMA_ + /* double puzzle_max_cropping_ratio */ + 
PUZZLE_DEFAULT_MAX_CROPPING_RATIO _COMA_ + /* int puzzle_enable_autocrop */ PUZZLE_DEFAULT_ENABLE_AUTOCROP _COMA_ + /* unsigned long magic */ PUZZLE_CONTEXT_MAGIC _COMA_ +}); +#endif diff --git a/deduper/libpuzzle/src/pics/Makefile.am b/deduper/libpuzzle/src/pics/Makefile.am new file mode 100644 index 0000000..510311f --- /dev/null +++ b/deduper/libpuzzle/src/pics/Makefile.am @@ -0,0 +1,8 @@ +EXTRA_DIST = \ + pic-a-0.jpg \ + pic-a-1.jpg \ + luxmarket_tshirt01.jpg \ + luxmarket_tshirt01_black.jpg \ + luxmarket_tshirt01_sal.jpg \ + luxmarket_tshirt01_sheum.jpg \ + duck.gif diff --git a/deduper/libpuzzle/src/pics/duck.gif b/deduper/libpuzzle/src/pics/duck.gif new file mode 100644 index 0000000..96c3037 Binary files /dev/null and b/deduper/libpuzzle/src/pics/duck.gif differ diff --git a/deduper/libpuzzle/src/pics/luxmarket_tshirt01.jpg b/deduper/libpuzzle/src/pics/luxmarket_tshirt01.jpg new file mode 100644 index 0000000..ffaf7eb Binary files /dev/null and b/deduper/libpuzzle/src/pics/luxmarket_tshirt01.jpg differ diff --git a/deduper/libpuzzle/src/pics/luxmarket_tshirt01_black.jpg b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_black.jpg new file mode 100644 index 0000000..73cac7b Binary files /dev/null and b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_black.jpg differ diff --git a/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sal.jpg b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sal.jpg new file mode 100644 index 0000000..cb0cefe Binary files /dev/null and b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sal.jpg differ diff --git a/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sheum.jpg b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sheum.jpg new file mode 100644 index 0000000..185393c Binary files /dev/null and b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sheum.jpg differ diff --git a/deduper/libpuzzle/src/pics/pic-a-0.jpg b/deduper/libpuzzle/src/pics/pic-a-0.jpg new file mode 100644 index 0000000..3dd4a3b Binary files /dev/null and 
b/deduper/libpuzzle/src/pics/pic-a-0.jpg differ diff --git a/deduper/libpuzzle/src/pics/pic-a-1.jpg b/deduper/libpuzzle/src/pics/pic-a-1.jpg new file mode 100644 index 0000000..95f0e77 Binary files /dev/null and b/deduper/libpuzzle/src/pics/pic-a-1.jpg differ diff --git a/deduper/libpuzzle/src/puzzle-diff.c b/deduper/libpuzzle/src/puzzle-diff.c new file mode 100644 index 0000000..e0f3626 --- /dev/null +++ b/deduper/libpuzzle/src/puzzle-diff.c @@ -0,0 +1,130 @@ +#include "puzzle_common.h" +#include "puzzle.h" + +typedef struct Opts_ { + const char *file1; + const char *file2; + int fix_for_texts; + int exit; + double similarity_threshold; +} Opts; + +void usage(void) +{ + puts("\nUsage: puzzle-diff [-b ] [-e] [-E similarity threshold] [-h]\n" + " [-H ] [-l ] [-n ]\n" + " [-p

] [-t] [-W ] \n\n" + "Visually compares two images and returns their distance.\n\n" + "-b \n" + "-c : disable autocrop\n" + "-C \n" + "-e : exit with 10 (images are similar) or 20 (images are not)\n" + "-E : for -e\n" + "-h : show help\n" + "-H : set max height\n" + "-l : change lambdas\n" + "-n : change noise cutoff\n" + "-p : set p ratio\n" + "-t disable fix for texts\n" + "-W : set max width\n" + "\n"); + exit(EXIT_SUCCESS); +} + +int parse_opts(Opts * const opts, PuzzleContext * context, + int argc, char * const *argv) { + int opt; + extern char *optarg; + extern int optind; + + opts->fix_for_texts = 1; + opts->exit = 0; + opts->similarity_threshold = PUZZLE_CVEC_SIMILARITY_THRESHOLD; + while ((opt = getopt(argc, argv, "b:cC:eE:hH:l:n:p:tW:")) != -1) { + switch (opt) { + case 'b': + puzzle_set_contrast_barrier_for_cropping(context, atof(optarg)); + break; + case 'c': + puzzle_set_autocrop(context, 0); + break; + case 'C': + puzzle_set_max_cropping_ratio(context, atof(optarg)); + break; + case 'e': + opts->exit = 1; + break; + case 'E': + opts->similarity_threshold = atof(optarg); + break; + case 'h': + usage(); + /* NOTREACHED */ + case 'H': + puzzle_set_max_height(context, strtoul(optarg, NULL, 10)); + break; + case 'l': + puzzle_set_lambdas(context, strtoul(optarg, NULL, 10)); + break; + case 'n': + puzzle_set_noise_cutoff(context, atof(optarg)); + break; + case 'p': + puzzle_set_p_ratio(context, atof(optarg)); + break; + case 't': + opts->fix_for_texts = 0; + break; + case 'W': + puzzle_set_max_width(context, strtoul(optarg, NULL, 10)); + break; + default: + usage(); + /* NOTREACHED */ + } + } + argc -= optind; + argv += optind; + if (argc != 2) { + usage(); + } + opts->file1 = *argv++; + opts->file2 = *argv; + + return 0; +} + +int main(int argc, char *argv[]) +{ + Opts opts; + PuzzleContext context; + PuzzleCvec cvec1, cvec2; + double d; + + puzzle_init_context(&context); + parse_opts(&opts, &context, argc, argv); + puzzle_init_cvec(&context, &cvec1); + 
puzzle_init_cvec(&context, &cvec2); + if (puzzle_fill_cvec_from_file(&context, &cvec1, opts.file1) != 0) { + fprintf(stderr, "Unable to read [%s]\n", opts.file1); + return 1; + } + if (puzzle_fill_cvec_from_file(&context, &cvec2, opts.file2) != 0) { + fprintf(stderr, "Unable to read [%s]\n", opts.file2); + return 1; + } + d = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, + opts.fix_for_texts); + puzzle_free_cvec(&context, &cvec1); + puzzle_free_cvec(&context, &cvec2); + puzzle_free_context(&context); + if (opts.exit == 0) { + printf("%g\n", d); + return 0; + } + if (d > opts.similarity_threshold) { + return 20; + } + return 10; +} diff --git a/deduper/libpuzzle/src/puzzle.c b/deduper/libpuzzle/src/puzzle.c new file mode 100644 index 0000000..e21c252 --- /dev/null +++ b/deduper/libpuzzle/src/puzzle.c @@ -0,0 +1,22 @@ +#define DEFINE_GLOBALS 1 +#include "puzzle_common.h" +#include "puzzle_p.h" +#include "puzzle.h" +#include "globals.h" + +void puzzle_init_context(PuzzleContext * const context) +{ + *context = puzzle_global_context; +} + +void puzzle_free_context(PuzzleContext * const context) +{ + (void) context; +} + +void puzzle_err_bug(const char * const file, const int line) +{ + fprintf(stderr, "*BUG* File: [%s] Line: [%d]\n", file, line); + abort(); +} + diff --git a/deduper/libpuzzle/src/puzzle.h b/deduper/libpuzzle/src/puzzle.h new file mode 100644 index 0000000..c31b43f --- /dev/null +++ b/deduper/libpuzzle/src/puzzle.h @@ -0,0 +1,122 @@ +#ifndef __PUZZLE_H__ +#define __PUZZLE_H__ 1 + +#define PUZZLE_VERSION_MAJOR 0 +#define PUZZLE_VERSION_MINOR 11 + +#include "puzzle_common.h" + +typedef struct PuzzleDvec_ { + size_t sizeof_vec; + size_t sizeof_compressed_vec; + double *vec; +} PuzzleDvec; + +typedef struct PuzzleCvec_ { + size_t sizeof_vec; + signed char *vec; +} PuzzleCvec; + +typedef struct PuzzleCompressedCvec_ { + size_t sizeof_compressed_vec; + unsigned char *vec; +} PuzzleCompressedCvec; + +typedef struct PuzzleContext_ { + unsigned int 
puzzle_max_width; + unsigned int puzzle_max_height; + unsigned int puzzle_lambdas; + double puzzle_p_ratio; + double puzzle_noise_cutoff; + double puzzle_contrast_barrier_for_cropping; + double puzzle_max_cropping_ratio; + int puzzle_enable_autocrop; + unsigned long magic; +} PuzzleContext; + +#ifdef __cplusplus +extern "C" { +#endif +void puzzle_init_context(PuzzleContext * const context); +void puzzle_free_context(PuzzleContext * const context); +int puzzle_set_max_width(PuzzleContext * const context, + const unsigned int width); +int puzzle_set_max_height(PuzzleContext * const context, + const unsigned int height); +int puzzle_set_lambdas(PuzzleContext * const context, + const unsigned int lambdas); +int puzzle_set_noise_cutoff(PuzzleContext * const context, + const double noise_cutoff); +int puzzle_set_p_ratio(PuzzleContext * const context, + const double p_ratio); +int puzzle_set_contrast_barrier_for_cropping(PuzzleContext * const context, + const double barrier); +int puzzle_set_max_cropping_ratio(PuzzleContext * const context, + const double ratio); +int puzzle_set_autocrop(PuzzleContext * const context, + const int enable); +void puzzle_init_cvec(PuzzleContext * const context, + PuzzleCvec * const cvec); +void puzzle_init_dvec(PuzzleContext * const context, + PuzzleDvec * const dvec); +int puzzle_fill_dvec_from_file(PuzzleContext * const context, + PuzzleDvec * const dvec, + const char * const file); +int puzzle_fill_cvec_from_file(PuzzleContext * const context, + PuzzleCvec * const cvec, + const char * const file); +int puzzle_fill_dvec_from_mem(PuzzleContext * const context, + PuzzleDvec * const dvec, + const void * const mem, + const size_t size); +int puzzle_fill_cvec_from_mem(PuzzleContext * const context, + PuzzleCvec * const cvec, + const void * const mem, + const size_t size); +int puzzle_fill_cvec_from_dvec(PuzzleContext * const context, + PuzzleCvec * const cvec, + const PuzzleDvec * const dvec); +void puzzle_free_cvec(PuzzleContext * const 
context, + PuzzleCvec * const cvec); +void puzzle_free_dvec(PuzzleContext * const context, + PuzzleDvec * const dvec); +int puzzle_dump_cvec(PuzzleContext * const context, + const PuzzleCvec * const cvec); +int puzzle_dump_dvec(PuzzleContext * const context, + const PuzzleDvec * const dvec); +int puzzle_cvec_cksum(PuzzleContext * const context, + const PuzzleCvec * const cvec, unsigned int * const sum); +void puzzle_init_compressed_cvec(PuzzleContext * const context, + PuzzleCompressedCvec * const compressed_cvec); +void puzzle_free_compressed_cvec(PuzzleContext * const context, + PuzzleCompressedCvec * const compressed_cvec); +int puzzle_compress_cvec(PuzzleContext * const context, + PuzzleCompressedCvec * const compressed_cvec, + const PuzzleCvec * const cvec); +int puzzle_uncompress_cvec(PuzzleContext * const context, + const PuzzleCompressedCvec * const compressed_cvec, + PuzzleCvec * const cvec); +int puzzle_vector_sub(PuzzleContext * const context, + PuzzleCvec * const cvecr, + const PuzzleCvec * const cvec1, + const PuzzleCvec * const cvec2, + const int fix_for_texts); +double puzzle_vector_euclidean_length(PuzzleContext * const context, + const PuzzleCvec * const cvec); +double puzzle_vector_normalized_distance(PuzzleContext * const context, + const PuzzleCvec * const cvec1, + const PuzzleCvec * const cvec2, + const int fix_for_texts); + +#ifdef __cplusplus +} +#endif + +#define PUZZLE_CVEC_SIMILARITY_THRESHOLD 0.6 +#define PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD 0.7 +#define PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD 0.3 +#define PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD 0.2 + +#define _COMA_ , + +#endif diff --git a/deduper/libpuzzle/src/puzzle_common.h b/deduper/libpuzzle/src/puzzle_common.h new file mode 100644 index 0000000..ebd340b --- /dev/null +++ b/deduper/libpuzzle/src/puzzle_common.h @@ -0,0 +1,18 @@ +#ifndef __PUZZLE_COMMON_H__ +#define __PUZZLE_COMMON_H__ 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + 
+#ifndef errno +extern int errno; +#endif + +#endif diff --git a/deduper/libpuzzle/src/puzzle_p.h b/deduper/libpuzzle/src/puzzle_p.h new file mode 100644 index 0000000..2f09494 --- /dev/null +++ b/deduper/libpuzzle/src/puzzle_p.h @@ -0,0 +1,67 @@ +#ifndef __PUZZLE_P_H__ +#define __PUZZLE_P_H__ 1 + +#include +#include + +typedef struct PuzzleView_ { + unsigned int width; + unsigned int height; + size_t sizeof_map; + unsigned char *map; +} PuzzleView; + +typedef struct PuzzleAvgLvls_ { + unsigned int lambdas; + size_t sizeof_lvls; + double *lvls; +} PuzzleAvgLvls; + +typedef enum PuzzleImageTypeCode_ { + PUZZLE_IMAGE_TYPE_ERROR, PUZZLE_IMAGE_TYPE_UNKNOWN, PUZZLE_IMAGE_TYPE_JPEG, + PUZZLE_IMAGE_TYPE_GIF, PUZZLE_IMAGE_TYPE_PNG +} PuzzleImageTypeCode; + +typedef struct PuzzleImageType_ { + const size_t sizeof_signature; + const unsigned char *signature; + const PuzzleImageTypeCode image_type_code; +} PuzzleImageType; + +#ifndef SIZE_MAX +# define SIZE_MAX ((size_t) -1) +#endif + +#define PUZZLE_DEFAULT_LAMBDAS 9 +#define PUZZLE_DEFAULT_MAX_WIDTH 3000 +#define PUZZLE_DEFAULT_MAX_HEIGHT 3000 +#define PUZZLE_DEFAULT_NOISE_CUTOFF 2.0 +#define PUZZLE_DEFAULT_P_RATIO 2.0 +#define PUZZLE_MIN_P 2 +#define PUZZLE_PIXEL_FUZZ_SIZE 1 +#define PUZZLE_NEIGHBORS 8 +#define PUZZLE_MIN_SIZE_FOR_CROPPING 100 +#if PUZZLE_MIN_SIZE_FOR_CROPPING < 4 +# error PUZZLE_MIN_SIZE_FOR_CROPPING +#endif +#define PUZZLE_DEFAULT_CONTRAST_BARRIER_FOR_CROPPING 0.05 +#define PUZZLE_DEFAULT_MAX_CROPPING_RATIO 0.25 +#define PUZZLE_DEFAULT_ENABLE_AUTOCROP 1 + +#define PUZZLE_VIEW_PIXEL(V, X, Y) (*((V)->map + (V)->width * (Y) + (X))) +#define PUZZLE_AVGLVL(A, X, Y) (*((A)->lvls + (A)->lambdas * (Y) + (X))) + +#define PUZZLE_CONTEXT_MAGIC 0xdeadbeef + +#ifndef MIN +# define MIN(A, B) ((A) < (B) ? (A) : (B)) +#endif +#ifndef MAX +# define MAX(A, B) ((A) > (B) ? 
(A) : (B)) +#endif +#define SUCC(A) ((A) + 1) +#define PRED(A) ((A) - 1) + +void puzzle_err_bug(const char * const file, const int line); + +#endif diff --git a/deduper/libpuzzle/src/regress_1.c b/deduper/libpuzzle/src/regress_1.c new file mode 100644 index 0000000..80462b8 --- /dev/null +++ b/deduper/libpuzzle/src/regress_1.c @@ -0,0 +1,32 @@ +#include "puzzle_common.h" +#include "puzzle.h" + +#define EXPECTED_RESULT 111444570 + +int main(void) +{ + PuzzleContext context; + PuzzleCvec cvec; + PuzzleCompressedCvec compressed_cvec; + unsigned int sum; + + puzzle_init_context(&context); + puzzle_init_compressed_cvec(&context, &compressed_cvec); + puzzle_init_cvec(&context, &cvec); + if (puzzle_fill_cvec_from_file(&context, &cvec, + "pics/luxmarket_tshirt01.jpg") != 0) { + fprintf(stderr, "File not found\n"); + exit(0); + } + puzzle_compress_cvec(&context, &compressed_cvec, &cvec); + puzzle_free_cvec(&context, &cvec); + puzzle_init_cvec(&context, &cvec); + puzzle_uncompress_cvec(&context, &compressed_cvec, &cvec); + puzzle_cvec_cksum(&context, &cvec, &sum); + puzzle_free_cvec(&context, &cvec); + puzzle_free_compressed_cvec(&context, &compressed_cvec); + puzzle_free_context(&context); + printf("%u %u\n", sum, (unsigned int) EXPECTED_RESULT); + + return sum != EXPECTED_RESULT; +} diff --git a/deduper/libpuzzle/src/regress_2.c b/deduper/libpuzzle/src/regress_2.c new file mode 100644 index 0000000..a37b626 --- /dev/null +++ b/deduper/libpuzzle/src/regress_2.c @@ -0,0 +1,72 @@ +#include "puzzle_common.h" +#include "puzzle.h" + +int main(void) +{ + PuzzleContext context; + PuzzleCvec cvec1, cvec2, cvec3, cvec4, cvec5, cvec6; + double d1, d2, d3, d4, d5, d6; + + puzzle_init_context(&context); + puzzle_init_cvec(&context, &cvec1); + puzzle_init_cvec(&context, &cvec2); + puzzle_init_cvec(&context, &cvec3); + puzzle_init_cvec(&context, &cvec4); + puzzle_init_cvec(&context, &cvec5); + puzzle_init_cvec(&context, &cvec6); + if (puzzle_fill_cvec_from_file + (&context, &cvec1, 
"pics/luxmarket_tshirt01.jpg") != 0) { + fprintf(stderr, "File 1 not found\n"); + exit(0); + } + if (puzzle_fill_cvec_from_file + (&context, &cvec2, "pics/luxmarket_tshirt01_black.jpg") != 0) { + fprintf(stderr, "File 2 not found\n"); + exit(0); + } + if (puzzle_fill_cvec_from_file + (&context, &cvec3, "pics/luxmarket_tshirt01_sal.jpg") != 0) { + fprintf(stderr, "File 3 not found\n"); + exit(0); + } + if (puzzle_fill_cvec_from_file + (&context, &cvec4, "pics/luxmarket_tshirt01_sheum.jpg") != 0) { + fprintf(stderr, "File 4 not found\n"); + exit(0); + } + if (puzzle_fill_cvec_from_file + (&context, &cvec5, "pics/duck.gif") != 0) { + fprintf(stderr, "File 5 not found\n"); + exit(0); + } + if (puzzle_fill_cvec_from_file + (&context, &cvec6, "pics/pic-a-0.jpg") != 0) { + fprintf(stderr, "File 6 not found\n"); + exit(0); + } + d1 = puzzle_vector_normalized_distance(&context, &cvec2, &cvec1, 1); + d2 = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, 1); + d3 = puzzle_vector_normalized_distance(&context, &cvec1, &cvec3, 1); + d4 = puzzle_vector_normalized_distance(&context, &cvec1, &cvec4, 1); + d5 = puzzle_vector_normalized_distance(&context, &cvec1, &cvec5, 1); + d6 = puzzle_vector_normalized_distance(&context, &cvec1, &cvec6, 1); + printf("%g %g %g %g %g %g\n", d1, d2, d3, d4, d5, d6); + puzzle_free_cvec(&context, &cvec1); + puzzle_free_cvec(&context, &cvec2); + puzzle_free_cvec(&context, &cvec3); + puzzle_free_cvec(&context, &cvec4); + puzzle_free_cvec(&context, &cvec5); + puzzle_free_cvec(&context, &cvec6); + puzzle_free_context(&context); + if ((int) (d1 * 100.0) != (int) (d2 * 100.0)) { + return 1; + } + if (d1 > PUZZLE_CVEC_SIMILARITY_THRESHOLD || + d3 > PUZZLE_CVEC_SIMILARITY_THRESHOLD || + d4 > PUZZLE_CVEC_SIMILARITY_THRESHOLD || + d5 < PUZZLE_CVEC_SIMILARITY_THRESHOLD || + d6 < PUZZLE_CVEC_SIMILARITY_THRESHOLD) { + return 2; + } + return 0; +} diff --git a/deduper/libpuzzle/src/regress_3.c b/deduper/libpuzzle/src/regress_3.c new file mode 100644 
index 0000000..33698ba --- /dev/null +++ b/deduper/libpuzzle/src/regress_3.c @@ -0,0 +1,35 @@ +#include "puzzle_common.h" +#include "puzzle.h" + +#define PUZZLE_VECTOR_SLICE 0.6 + +int main(void) +{ + PuzzleContext context; + PuzzleCvec cvec1, cvec2; + double d1, d2; + + puzzle_init_context(&context); + puzzle_init_cvec(&context, &cvec1); + puzzle_init_cvec(&context, &cvec2); + if (puzzle_fill_cvec_from_file(&context, &cvec1, + "pics/pic-a-0.jpg") != 0) { + fprintf(stderr, "File 1 not found\n"); + exit(0); + } + if (puzzle_fill_cvec_from_file(&context, &cvec2, + "pics/pic-a-1.jpg") != 0) { + fprintf(stderr, "File 2 not found\n"); + exit(0); + } + d1 = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, 1); + d2 = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, 0); + printf("%g %g\n", d1, d2); + puzzle_free_cvec(&context, &cvec1); + puzzle_free_cvec(&context, &cvec2); + puzzle_free_context(&context); + if (d1 > PUZZLE_VECTOR_SLICE || d2 > PUZZLE_VECTOR_SLICE) { + return 2; + } + return 0; +} diff --git a/deduper/libpuzzle/src/tunables.c b/deduper/libpuzzle/src/tunables.c new file mode 100644 index 0000000..280dfb2 --- /dev/null +++ b/deduper/libpuzzle/src/tunables.c @@ -0,0 +1,84 @@ +#include "puzzle_common.h" +#include "puzzle_p.h" +#include "puzzle.h" +#include "globals.h" + +int puzzle_set_max_width(PuzzleContext * const context, + const unsigned int width) +{ + if (width <= 0U) { + return -1; + } + context->puzzle_max_width = width; + + return 0; +} + +int puzzle_set_max_height(PuzzleContext * const context, + const unsigned int height) +{ + if (height <= 0U) { + return -1; + } + context->puzzle_max_height = height; + + return 0; +} + +int puzzle_set_lambdas(PuzzleContext * const context, + const unsigned int lambdas) +{ + if (lambdas <= 0U) { + return -1; + } + context->puzzle_lambdas = lambdas; + + return 0; +} + +int puzzle_set_p_ratio(PuzzleContext * const context, const double p_ratio) +{ + if (p_ratio < 1.0) { + return -1; + } + 
context->puzzle_p_ratio = p_ratio; + + return 0; +} + +int puzzle_set_noise_cutoff(PuzzleContext * const context, + const double noise_cutoff) +{ + context->puzzle_noise_cutoff = noise_cutoff; + + return 0; +} + +int puzzle_set_contrast_barrier_for_cropping(PuzzleContext * const context, + const double barrier) +{ + if (barrier <= 0.0) { + return -1; + } + context->puzzle_contrast_barrier_for_cropping = barrier; + + return 0; +} + +int puzzle_set_max_cropping_ratio(PuzzleContext * const context, + const double ratio) +{ + if (ratio <= 0.0) { + return -1; + } + context->puzzle_max_cropping_ratio = ratio; + + return 0; +} + +int puzzle_set_autocrop(PuzzleContext * const context, const int enable) +{ + context->puzzle_enable_autocrop = (enable != 0); + + return 0; +} diff --git a/deduper/libpuzzle/src/vector_ops.c b/deduper/libpuzzle/src/vector_ops.c new file mode 100644 index 0000000..4fad5bf --- /dev/null +++ b/deduper/libpuzzle/src/vector_ops.c @@ -0,0 +1,95 @@ +#include "puzzle_common.h" +#include "puzzle_p.h" +#include "puzzle.h" +#include "globals.h" + +int puzzle_vector_sub(PuzzleContext * const context, + PuzzleCvec * const cvecr, + const PuzzleCvec * const cvec1, + const PuzzleCvec * const cvec2, + const int fix_for_texts) +{ + size_t remaining; + signed char c1, c2, cr; + + (void) context; + if (cvec1->sizeof_vec != cvec2->sizeof_vec || + cvec1->sizeof_vec <= (size_t) 0U) { + puzzle_err_bug(__FILE__, __LINE__); + } + if (cvecr->vec != NULL) { + puzzle_err_bug(__FILE__, __LINE__); + } + cvecr->sizeof_vec = cvec1->sizeof_vec; + if ((cvecr->vec = calloc(cvecr->sizeof_vec, sizeof *cvecr->vec)) == NULL) { + return -1; + } + remaining = cvec1->sizeof_vec; + if (fix_for_texts != 0) { + do { + remaining--; + c1 = cvec1->vec[remaining]; + c2 = cvec2->vec[remaining]; + if ((c1 == 0 && c2 == -2) || (c1 == -2 && c2 == 0)) { + cr = -3; + } else if ((c1 == 0 && c2 == +2) || (c1 == +2 && c2 == 0)) { + cr = +3; + } else { + cr = c1 - c2; + } + cvecr->vec[remaining] = cr; + 
} while (remaining > (size_t) 0U); + } else { + do { + remaining--; + cvecr->vec[remaining] = + cvec1->vec[remaining] - cvec2->vec[remaining]; + } while (remaining > (size_t) 0U); + } + return 0; +} + +double puzzle_vector_euclidean_length(PuzzleContext * const context, + const PuzzleCvec * const cvec) +{ + unsigned long t = 0U; + unsigned long c; + int c2; + size_t remaining; + + (void) context; + if ((remaining = cvec->sizeof_vec) <= (size_t) 0U) { + return 0.0; + } + do { + remaining--; + c2 = (int) cvec->vec[remaining]; + c = (unsigned long) (c2 * c2); + if (ULONG_MAX - t < c) { + puzzle_err_bug(__FILE__, __LINE__); + } + t += c; + } while (remaining > (size_t) 0U); + + return sqrt((double) t); +} + +double puzzle_vector_normalized_distance(PuzzleContext * const context, + const PuzzleCvec * const cvec1, + const PuzzleCvec * const cvec2, + const int fix_for_texts) +{ + PuzzleCvec cvecr; + double dt, dr; + + puzzle_init_cvec(context, &cvecr); + puzzle_vector_sub(context, &cvecr, cvec1, cvec2, fix_for_texts); + dt = puzzle_vector_euclidean_length(context, &cvecr); + puzzle_free_cvec(context, &cvecr); + dr = puzzle_vector_euclidean_length(context, cvec1) + + puzzle_vector_euclidean_length(context, cvec2); + if (dr == 0.0) { + return 0.0; + } + return dt / dr; +} diff --git a/deduper/thread_pool.h b/deduper/thread_pool.h new file mode 100644 index 0000000..ee661ce --- /dev/null +++ b/deduper/thread_pool.h @@ -0,0 +1,127 @@ +#ifndef THREAD_POOL_H +#define THREAD_POOL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +template +class _atomic_queue +{ +public: + void push(T&v) + { + std::unique_lock lck(mtx); + q.push(v); + } + bool pop(T&v) + { + std::unique_lock lck(mtx); + if(!q.empty()) + { + v=std::move(q.front()); + q.pop(); + return true; + } + return false; + } + size_t size() + { + std::unique_lock lck(mtx); + return q.size(); + } +private: + std::queue q; + std::mutex mtx; +}; + +class thread_pool +{ +public: + 
thread_pool(size_t njobs):waiting_threads(0),stop(false),wait_interrupt(false) + { + thr.resize(njobs); + thstop.resize(njobs); + for(size_t i=0;i>(false); + auto looper=[this,i,cstop]{ + std::atomic&stop=*cstop; + std::function *f; + bool popped=wq.pop(f); + while(1) + { + for(;popped;popped=wq.pop(f)) + { + std::unique_ptr> pf(f); + (*f)(i); + if(stop)return; + } + std::unique_lock lck(mtx); + ++waiting_threads; + cv.wait(lck,[this,&f,&popped,&stop]{ + popped=wq.pop(f); + return popped||wait_interrupt||stop; + }); + --waiting_threads; + if(!popped)return; + } + }; + thr[i].reset(new std::thread(looper)); + } + } + template + auto create_task(F&&f,A&&...args)->std::future + { + auto task=std::make_shared>( + std::bind(std::forward(f),std::placeholders::_1,std::forward(args)...) + ); + auto worktask=new std::function([task](int id){(*task)(id);}); + wq.push(worktask); + std::unique_lock lck(mtx); + cv.notify_one(); + return task->get_future(); + } + void wait() + { + if(!stop)wait_interrupt=true; + { + std::unique_lock lck(mtx); + cv.notify_all(); + } + for(size_t i=0;ijoinable())thr[i]->join(); + std::function *f; + while(wq.size()){wq.pop(f);delete f;} + thr.clear();thstop.clear(); + } + void terminate() + { + stop=true; + std::function *f; + while(wq.size()){wq.pop(f);delete f;} + for(size_t i=0;i lck(mtx); + cv.notify_all(); + } + for(size_t i=0;ijoinable())thr[i]->join(); + while(wq.size()){wq.pop(f);delete f;} + thr.clear();thstop.clear(); + } +private: + std::vector> thr; + std::vector>> thstop; + _atomic_queue*> wq; + std::atomic wait_interrupt; + std::atomic stop; + std::atomic waiting_threads; + std::mutex mtx; + std::condition_variable cv; +}; + +#endif -- cgit v1.2.3