aboutsummaryrefslogtreecommitdiff
path: root/deduper
diff options
context:
space:
mode:
authorGravatar Chris Xiong <chirs241097@gmail.com> 2020-04-06 00:50:58 +0800
committerGravatar Chris Xiong <chirs241097@gmail.com> 2020-04-06 00:50:58 +0800
commited47c1557915bb2472f6959e723cd76155312a98 (patch)
tree85bc451630ebaa4f5ffce3043b4cbf948a912a66 /deduper
parent0a094f28c2e2ebfaac91398ae62e40f00f09221b (diff)
downloadoddities-ed47c1557915bb2472f6959e723cd76155312a98.tar.xz
Add deduper (unfinished tool for finding image duplicates).
Diffstat (limited to 'deduper')
-rw-r--r--deduper/CMakeLists.txt20
-rw-r--r--deduper/deduper.cpp195
-rw-r--r--deduper/libpuzzle/AUTHORS1
-rw-r--r--deduper/libpuzzle/COPYING17
-rw-r--r--deduper/libpuzzle/ChangeLog0
-rw-r--r--deduper/libpuzzle/Makefile.am11
-rw-r--r--deduper/libpuzzle/NEWS0
-rw-r--r--deduper/libpuzzle/README202
-rw-r--r--deduper/libpuzzle/README-PHP76
-rw-r--r--deduper/libpuzzle/THANKS6
-rwxr-xr-xdeduper/libpuzzle/autogen.sh17
-rw-r--r--deduper/libpuzzle/composer.json10
-rw-r--r--deduper/libpuzzle/configure.ac70
-rw-r--r--deduper/libpuzzle/man/Makefile.am7
-rw-r--r--deduper/libpuzzle/man/libpuzzle.3296
-rw-r--r--deduper/libpuzzle/man/puzzle-diff.858
-rw-r--r--deduper/libpuzzle/man/puzzle_set.3129
-rw-r--r--deduper/libpuzzle/php/Makefile.am3
-rw-r--r--deduper/libpuzzle/php/examples/Makefile.am2
-rw-r--r--deduper/libpuzzle/php/examples/similar/Makefile.am6
-rw-r--r--deduper/libpuzzle/php/examples/similar/config.inc.php9
-rw-r--r--deduper/libpuzzle/php/examples/similar/schema.pgsql.sql230
-rw-r--r--deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql23
-rw-r--r--deduper/libpuzzle/php/examples/similar/similar.inc.php120
-rw-r--r--deduper/libpuzzle/php/examples/similar/similar.php158
-rw-r--r--deduper/libpuzzle/php/libpuzzle/CREDITS1
-rw-r--r--deduper/libpuzzle/php/libpuzzle/EXPERIMENTAL0
-rw-r--r--deduper/libpuzzle/php/libpuzzle/LICENSE15
-rw-r--r--deduper/libpuzzle/php/libpuzzle/Makefile.am15
-rw-r--r--deduper/libpuzzle/php/libpuzzle/README4
-rw-r--r--deduper/libpuzzle/php/libpuzzle/build/Makefile.am0
-rw-r--r--deduper/libpuzzle/php/libpuzzle/config.m449
-rw-r--r--deduper/libpuzzle/php/libpuzzle/include/Makefile.am0
-rw-r--r--deduper/libpuzzle/php/libpuzzle/libpuzzle.c410
-rw-r--r--deduper/libpuzzle/php/libpuzzle/libpuzzle.php21
-rw-r--r--deduper/libpuzzle/php/libpuzzle/modules/Makefile.am0
-rw-r--r--deduper/libpuzzle/php/libpuzzle/php_libpuzzle.h66
-rw-r--r--deduper/libpuzzle/php/libpuzzle/tests/001.phpt10
-rw-r--r--deduper/libpuzzle/php/libpuzzle/tests/002.phpt15
-rw-r--r--deduper/libpuzzle/php/libpuzzle/tests/003.phpt24
-rw-r--r--deduper/libpuzzle/php/libpuzzle/tests/Makefile.am7
-rw-r--r--deduper/libpuzzle/php/libpuzzle/tests/pics/Makefile.am3
-rw-r--r--deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-0.jpgbin0 -> 13946 bytes
-rw-r--r--deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-1.jpgbin0 -> 27407 bytes
-rw-r--r--deduper/libpuzzle/src/CMakeLists.txt21
-rw-r--r--deduper/libpuzzle/src/Makefile.am72
-rw-r--r--deduper/libpuzzle/src/compress.c125
-rw-r--r--deduper/libpuzzle/src/cvec.c202
-rw-r--r--deduper/libpuzzle/src/dvec.c663
-rw-r--r--deduper/libpuzzle/src/globals.h26
-rw-r--r--deduper/libpuzzle/src/pics/Makefile.am8
-rw-r--r--deduper/libpuzzle/src/pics/duck.gifbin0 -> 7196 bytes
-rw-r--r--deduper/libpuzzle/src/pics/luxmarket_tshirt01.jpgbin0 -> 41128 bytes
-rw-r--r--deduper/libpuzzle/src/pics/luxmarket_tshirt01_black.jpgbin0 -> 19800 bytes
-rw-r--r--deduper/libpuzzle/src/pics/luxmarket_tshirt01_sal.jpgbin0 -> 24646 bytes
-rw-r--r--deduper/libpuzzle/src/pics/luxmarket_tshirt01_sheum.jpgbin0 -> 16128 bytes
-rw-r--r--deduper/libpuzzle/src/pics/pic-a-0.jpgbin0 -> 13946 bytes
-rw-r--r--deduper/libpuzzle/src/pics/pic-a-1.jpgbin0 -> 27407 bytes
-rw-r--r--deduper/libpuzzle/src/puzzle-diff.c130
-rw-r--r--deduper/libpuzzle/src/puzzle.c22
-rw-r--r--deduper/libpuzzle/src/puzzle.h122
-rw-r--r--deduper/libpuzzle/src/puzzle_common.h18
-rw-r--r--deduper/libpuzzle/src/puzzle_p.h67
-rw-r--r--deduper/libpuzzle/src/regress_1.c32
-rw-r--r--deduper/libpuzzle/src/regress_2.c72
-rw-r--r--deduper/libpuzzle/src/regress_3.c35
-rw-r--r--deduper/libpuzzle/src/tunables.c84
-rw-r--r--deduper/libpuzzle/src/vector_ops.c95
-rw-r--r--deduper/thread_pool.h127
69 files changed, 4227 insertions, 0 deletions
diff --git a/deduper/CMakeLists.txt b/deduper/CMakeLists.txt
new file mode 100644
index 0000000..ac0859d
--- /dev/null
+++ b/deduper/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Build script for the deduper tool. It builds the bundled libpuzzle sources
+# (libpuzzle/src) and links the executable against them, libgd and the
+# platform thread library.
+#
+# target_link_directories() is only available from CMake 3.13 onwards, so
+# that is the lowest version this file can actually be processed with.
+cmake_minimum_required(VERSION 3.13.0)
+project(deduper C CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# libgd is located through pkg-config.
+find_package(PkgConfig REQUIRED)
+pkg_search_module(gdlib REQUIRED gdlib)
+find_package(Threads REQUIRED)
+
+add_subdirectory(libpuzzle/src)
+add_executable(deduper deduper.cpp thread_pool.h)
+target_link_directories(deduper
+    PRIVATE
+    ${gdlib_LIBRARY_DIRS}
+)
+# Threads::Threads carries both the library and any compiler flags that the
+# thread library needs.
+target_link_libraries(deduper
+    puzzle
+    ${gdlib_LIBRARIES}
+    Threads::Threads
+)
diff --git a/deduper/deduper.cpp b/deduper/deduper.cpp
new file mode 100644
index 0000000..8f6e2f4
--- /dev/null
+++ b/deduper/deduper.cpp
@@ -0,0 +1,195 @@
+#include "libpuzzle/src/puzzle.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include <atomic>
+#include <filesystem>
+#include <mutex>
+#include <string>
+#include <system_error>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <getopt.h>
+
+#include "thread_pool.h"
+
+// libpuzzle context passed to every puzzle_* call in this file.
+PuzzleContext pzctx;
+// Progress counter printed as "done/total". It is incremented from jobs that
+// run on thread-pool workers, hence atomic rather than a plain int.
+std::atomic<int> ctr{0};
+// Non-zero when -r/--recursive was given (int because getopt_long writes to it).
+int recursive;
+// Number of worker threads handed to thread_pool (-j/--jobs).
+int njobs=1;
+// Normalized distance below which two images are reported (-d/--threshold).
+double threshold=0.3;
+// Positional PATH arguments collected by parse_arguments().
+std::vector<std::string> paths;
+
+/**
+ * Parse the command line into the globals `recursive`, `njobs`, `threshold`
+ * and `paths`.
+ *
+ * @param argc argument count as received by main().
+ * @param argv argument vector as received by main().
+ * @return 0 when parsing succeeded,
+ *         1 when the usage text was printed (--help, or no arguments at all),
+ *         2 when an option or one of its values is invalid.
+ */
+int parse_arguments(int argc,char **argv)
+{
+    recursive=0;
+    int help=0;
+    bool bad_jobs=false;
+    bool bad_threshold=false;
+    option longopt[]=
+    {
+        {"recursive",no_argument ,&recursive,1},
+//      {"destdir" ,required_argument,0 ,'D'},
+        {"jobs" ,required_argument,0 ,'j'},
+        {"threshold",required_argument,0 ,'d'},
+        {"help" ,no_argument ,&help ,1},
+        {0 ,0 ,0 ,0}
+    };
+    for(;;)
+    {
+        int idx=0;
+        const int c=getopt_long(argc,argv,"rhj:d:",longopt,&idx);
+        if(c==-1)break;
+        switch(c)
+        {
+            case 0:
+                // Only the flag-style entries (--recursive, --help) yield 0;
+                // getopt_long has already stored their value. --jobs and
+                // --threshold have a null flag and come back as 'j' / 'd'.
+                break;
+            case 'r':
+                recursive=1;
+                break;
+            case 'h':
+                help=1;
+                break;
+            case 'j':
+                // A job count that does not parse or is below 1 is rejected
+                // after the loop instead of being silently kept.
+                if(sscanf(optarg,"%d",&njobs)!=1||njobs<1)bad_jobs=true;
+                break;
+            case 'd':
+                if(sscanf(optarg,"%lf",&threshold)!=1)bad_threshold=true;
+                break;
+            default:
+                // '?': unknown option or missing option argument. getopt_long
+                // prints its own diagnostic unless opterr has been cleared.
+                return 2;
+        }
+    }
+    for(;optind<argc;++optind)
+        paths.push_back(argv[optind]);
+    if(help||argc<2)
+    {
+        printf(
+            "Usage: %s [OPTION] PATH...\n"
+            "Detect potentially duplicate images in PATHs and optionally perform an action on them.\n\n"
+            " -h, --help Display this help message and exit.\n"
+            " -r, --recursive Recurse into all directories.\n"
+            " -j, --jobs Number of concurrent tasks to run at once.\n"
+            " -d, --threshold Threshold distance below which images will be considered similar.\n"
+            ,argv[0]
+        );
+        return 1;
+    }
+    if(bad_jobs)
+    {
+        puts("Invalid number of jobs.");
+        return 2;
+    }
+    if(bad_threshold||threshold>1||threshold<0)
+    {
+        puts("Invalid threshold value.");
+        return 2;
+    }
+    // A threshold of exactly 0 could never match with the strict "<" used
+    // when comparing, so keep a tiny positive floor.
+    if(threshold<1e-6)threshold=1e-6;
+    if(paths.empty())
+    {
+        puts("Missing image path.");
+        return 2;
+    }
+    return 0;
+}
+
+// Returns true when the file at `p` can be opened and its first bytes match
+// a PNG, JPEG or GIF signature.
+static bool has_image_signature(const std::filesystem::path&p)
+{
+    FILE* fp=fopen(p.c_str(),"rb");
+    if(!fp)return false; // unreadable entry: not a candidate
+    char head[6];
+    const size_t got=fread(head,1,sizeof head,fp);
+    fclose(fp);
+    // Only compare bytes that were actually read.
+    if(got>=3&&!memcmp(head,"\xff\xd8\xff",3))return true;
+    if(got<6)return false;
+    return !memcmp(head,"\x89PNG\r\n",6)||!memcmp(head,"GIF87a",6)||!memcmp(head,"GIF89a",6);
+}
+
+/**
+ * Append to `out` the path of every regular file under `path` whose leading
+ * bytes look like a PNG, JPEG or GIF image.
+ *
+ * @param path      directory to scan.
+ * @param recursive when true, descend into sub-directories as well.
+ * @param out       receives the matching paths (existing content is kept).
+ *
+ * Errors raised by the directory iterators (for instance when `path` is not
+ * a directory) propagate to the caller as std::filesystem::filesystem_error.
+ */
+void build_file_list(std::filesystem::path path,bool recursive,std::vector<std::string>&out)
+{
+    namespace fs=std::filesystem;
+    const auto consider=[&out](const fs::directory_entry&entry){
+        std::error_code ec;
+        // Directories, FIFOs, sockets etc. are skipped without being opened.
+        if(entry.is_regular_file(ec)&&has_image_signature(entry.path()))
+            out.push_back(entry.path().string());
+    };
+    if(recursive)
+    {
+        for(const auto&entry:fs::recursive_directory_iterator(path))
+            consider(entry);
+    }
+    else
+    {
+        for(const auto&entry:fs::directory_iterator(path))
+            consider(entry);
+    }
+}
+
+/**
+ * Compute the libpuzzle signature of every file in `files`, using a pool of
+ * `njobs` threads.
+ *
+ * @param files  paths of the images to load.
+ * @param output receives one PuzzleCvec per file, at the same index. It is
+ *               grown if it holds fewer elements than `files`. Every slot is
+ *               initialised with puzzle_init_cvec(), so the caller releases
+ *               each one with puzzle_free_cvec().
+ *
+ * Progress is written to stdout as "done/total"; per-file traces and load
+ * failures go to stderr.
+ */
+void compute_signature_vectors(const std::vector<std::string>&files,std::vector<PuzzleCvec>&output)
+{
+    const size_t total=files.size();
+    // Must happen before any job is queued: jobs keep references into output.
+    if(output.size()<total)output.resize(total);
+    thread_pool tp(njobs);
+    for(size_t i=0;i<total;++i)
+    {
+        puzzle_init_cvec(&pzctx,&output[i]);
+        auto job_func=[&files,&output,total](int thid,size_t id){
+            fprintf(stderr,"spawned: on thread#%d, file#%zu\n",thid,id);
+            // libpuzzle functions return 0 on success and -1 on failure.
+            if(puzzle_fill_cvec_from_file(&pzctx,&output[id],files[id].c_str())!=0)
+                fprintf(stderr,"failed: file#%zu (%s)\n",id,files[id].c_str());
+            fprintf(stderr,"done: file#%zu\n",id);
+            printf("%d/%zu\r",++ctr,total);
+            fflush(stdout);
+        };
+        tp.create_task(job_func,i);
+    }
+    tp.wait();
+}
+
+/**
+ * Compare every unordered pair of signatures in `vec`, using a pool of
+ * `njobs` threads, and collect the pairs whose normalized distance is below
+ * the global `threshold`.
+ *
+ * @param vec signatures to compare; entries with sizeof_vec == 0 are skipped.
+ * @param out receives (index a, index b, distance) for every pair found.
+ *            The order of the entries depends on job scheduling.
+ */
+void compare_signature_vectors(const std::vector<PuzzleCvec>&vec,std::vector<std::tuple<size_t,size_t,double>>&out)
+{
+    // Declared before the pool so that it outlives every job.
+    std::mutex out_mtx;
+    const size_t n=vec.size();
+    const size_t total=n*(n-1)/2;
+    thread_pool tp(njobs);
+    for(size_t i=0;i<n;++i)
+    for(size_t j=i+1;j<n;++j)
+    {
+        auto job_func=[&vec,&out,&out_mtx,total](int thid,size_t ida,size_t idb){
+            fprintf(stderr,"spawned: on thread#%d, file#%zu<->file#%zu\n",thid,ida,idb);
+            if(vec[ida].sizeof_vec&&vec[idb].sizeof_vec)
+            {
+                double d=puzzle_vector_normalized_distance(&pzctx,&vec[ida],&vec[idb],1);
+                if(d<threshold)
+                {
+                    // out is shared by all jobs; appending must be serialised.
+                    std::lock_guard<std::mutex> lock(out_mtx);
+                    out.emplace_back(ida,idb,d);
+                }
+                fprintf(stderr,"done:file#%zu<->file#%zu: %lf\n",ida,idb,d);
+            }
+            printf("%d/%zu\r",++ctr,total);
+            fflush(stdout);
+        };
+        tp.create_task(job_func,i,j);
+    }
+    tp.wait();
+}
+
+/**
+ * Entry point: parse options, collect candidate image files, compute their
+ * signatures, compare all pairs and print the pairs that fall below the
+ * similarity threshold.
+ *
+ * @return 0 on success or after printing the usage text, 1 on invalid
+ *         arguments or when a PATH cannot be scanned.
+ */
+int main(int argc,char** argv)
+{
+    // parse_arguments: 1 = usage shown (exit 0), 2 = error (exit 1).
+    if(int pr=parse_arguments(argc,argv))return pr-1;
+    puts("building list of files to compare...");
+    std::vector<std::string> x;
+    try
+    {
+        for(auto&p:paths)
+            build_file_list(p,recursive,x);
+    }
+    catch(const std::filesystem::filesystem_error&e)
+    {
+        // e.g. a PATH that does not exist or is not a directory.
+        fprintf(stderr,"%s\n",e.what());
+        return 1;
+    }
+    printf("%zu files to compare.\n",x.size());
+    puts("computing signature vectors...");
+    puzzle_init_context(&pzctx);
+    std::vector<PuzzleCvec> cvecs;
+    cvecs.resize(x.size());
+    compute_signature_vectors(x,cvecs);
+    // Debug dump of every signature: element count followed by the elements.
+    for(auto &v:cvecs)
+    {
+        fprintf(stderr,"%zu:",v.sizeof_vec);
+        for(size_t i=0;i<v.sizeof_vec;++i)
+            fprintf(stderr," %d",v.vec[i]);
+        fprintf(stderr,"\n");
+    }
+    // The progress counter is reused by the comparison stage.
+    ctr=0;
+    puts("\ncomparing signature vectors...");
+    std::vector<std::tuple<size_t,size_t,double>> r;
+    compare_signature_vectors(cvecs,r);
+    puts("");
+    for(auto &t:r)
+        printf("%s<->%s: %lf\n",x[std::get<0>(t)].c_str(),x[std::get<1>(t)].c_str(),std::get<2>(t));
+    printf("%zu similar images.\n",r.size());
+    for(auto &v:cvecs)puzzle_free_cvec(&pzctx,&v);
+    cvecs.clear();
+    puzzle_free_context(&pzctx);
+    return 0;
+}
diff --git a/deduper/libpuzzle/AUTHORS b/deduper/libpuzzle/AUTHORS
new file mode 100644
index 0000000..bb6ecb3
--- /dev/null
+++ b/deduper/libpuzzle/AUTHORS
@@ -0,0 +1 @@
+Frank DENIS <j at pureftpd.org>
diff --git a/deduper/libpuzzle/COPYING b/deduper/libpuzzle/COPYING
new file mode 100644
index 0000000..30877ad
--- /dev/null
+++ b/deduper/libpuzzle/COPYING
@@ -0,0 +1,17 @@
+/*
+ * ISC License
+ *
+ * Copyright (c) 2007-2015 Frank DENIS <j at pureftpd.org>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
diff --git a/deduper/libpuzzle/ChangeLog b/deduper/libpuzzle/ChangeLog
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/deduper/libpuzzle/ChangeLog
diff --git a/deduper/libpuzzle/Makefile.am b/deduper/libpuzzle/Makefile.am
new file mode 100644
index 0000000..fce7f7b
--- /dev/null
+++ b/deduper/libpuzzle/Makefile.am
@@ -0,0 +1,11 @@
+# Top-level Automake input for libpuzzle.
+# "gnu" selects Automake's GNU strictness level.
+AUTOMAKE_OPTIONS = gnu
+
+# Extra files to ship in the distribution archive.
+EXTRA_DIST = \
+ autogen.sh \
+ THANKS \
+ README-PHP
+
+# Sub-directories that make descends into, in this order.
+SUBDIRS = \
+ src \
+ man \
+ php
diff --git a/deduper/libpuzzle/NEWS b/deduper/libpuzzle/NEWS
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/deduper/libpuzzle/NEWS
diff --git a/deduper/libpuzzle/README b/deduper/libpuzzle/README
new file mode 100644
index 0000000..502a1c0
--- /dev/null
+++ b/deduper/libpuzzle/README
@@ -0,0 +1,202 @@
+
+ .:. LIBPUZZLE .:.
+
+ http://libpuzzle.pureftpd.org
+
+
+ ------------------------ BLURB ------------------------
+
+
+The Puzzle library is designed to quickly find visually similar images (gif,
+png, jpg), even if they have been resized, recompressed, recolored or slightly
+modified.
+
+The library is free, lightweight yet very fast, configurable, easy to use and
+it has been designed with security in mind. This is a C library, but it also
+comes with a command-line tool and PHP bindings.
+
+
+ ------------------------ REFERENCE ------------------------
+
+
+The Puzzle library is an implementation of "An image signature for any kind of
+image", by H. CHI WONG, Marschall BERN and David GOLDBERG.
+
+
+ ------------------------ COMPILATION ------------------------
+
+
+In order to load images, the library relies on the GD2 library.
+You need to install gdlib2 and its development headers before compiling
+libpuzzle.
+The GD2 library is available as a pre-built package for most operating systems.
+Debian and Ubuntu users should install the "libgd2-dev" or the "libgd2-xpm-dev"
+package.
+Gentoo users should install "media-libs/gd".
+OpenBSD, NetBSD and DragonflyBSD users should install the "gd" package.
+MacPorts users should install the "gd2" package.
+X11 support is not required for the Puzzle library.
+
+Once GD2 has been installed, configure the Puzzle library as usual:
+
+./configure
+
+This is a standard autoconf script, if you're not familiar with it, please
+have a look at the INSTALL file.
+
+Compile the beast:
+
+make
+
+Try the built-in tests:
+
+make check
+
+If everything looks fine, install the software:
+
+make install
+
+If anything goes wrong, please submit a bug report to:
+ libpuzzle [at] pureftpd [dot] org
+
+
+ ------------------------ USAGE ------------------------
+
+
+The API is documented in the libpuzzle(3) and puzzle_set(3) man pages.
+You can also play with the puzzle-diff test application.
+See puzzle-diff(8) for more info about the puzzle-diff application.
+
+In order to be thread-safe, every exported function of the library requires a
+PuzzleContext object. That object stores various run-time tunables.
+
+Out of a bitmap picture, the Puzzle library can fill a PuzzleCVec object :
+
+ PuzzleContext context;
+ PuzzleCVec cvec;
+
+ puzzle_init_context(&context);
+ puzzle_init_cvec(&context, &cvec);
+ puzzle_fill_cvec_from_file(&context, &cvec, "directory/filename.jpg");
+
+The PuzzleCvec structure holds two fields:
+ signed char *vec: a pointer to the first element of the vector
+ size_t sizeof_vec: the number of elements
+
+The size depends on the "lambdas" value (see puzzle_set(3)).
+
+PuzzleCvec structures can be compared:
+
+ d = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, 1);
+
+d is the normalized distance between both vectors. If d is below 0.6, pictures
+are probably similar.
+
+If you need further help, feel free to subscribe to the mailing-list (see
+below).
+
+
+ ------------------------ INDEXING ------------------------
+
+
+How to quickly find similar pictures, if there are millions of records?
+
+The original paper has a simple, yet efficient answer.
+
+Cut the vector in fixed-length words. For instance, let's consider the
+following vector:
+
+[ a b c d e f g h i j k l m n o p q r s t u v w x y z ]
+
+With a word length (K) of 10, you can get the following words:
+
+[ a b c d e f g h i j ] found at position 0
+[ b c d e f g h i j k ] found at position 1
+[ c d e f g h i j k l ] found at position 2
+etc. until position N-1
+
+Then, index your vector with a compound index of (word + position).
+
+Even with millions of images, K = 10 and N = 100 should be enough to have very
+few entries sharing the same index.
+
+Here's a very basic sample database schema:
+
++-----------------------------+
+| signatures |
++-----------------------------+
+| sig_id | signature | pic_id |
++--------+-----------+--------+
+
++--------------------------+
+| words |
++--------------------------+
+| pos_and_word | fk_sig_id |
++--------------+-----------+
+
+I'd recommend splitting at least the "words" table into multiple tables and/or
+servers.
+
+By default (lambdas=9) signatures are 544 bytes long. In order to save storage
+space, they can be compressed to one third of their original size through the
+puzzle_compress_cvec() function. Before use, they must be uncompressed with
+puzzle_uncompress_cvec().
+
+
+ ------------------------ PUZZLE-DIFF ------------------------
+
+
+A command-line tool is also available for scripting or testing.
+
+It is installed as "puzzle-diff" and comes with a man page.
+
+Sample usage:
+
+- Output distance between two images:
+
+$ puzzle-diff pic-a-0.jpg pic-a-1.jpg
+0.102286
+
+- Compare two images, exit with 10 if they look the same, exit with 20 if
+they don't (may be useful for scripts):
+
+$ puzzle-diff -e pic-a-0.jpg pic-a-1.jpg
+$ echo $?
+10
+
+- Compute distance, without cropping and with computing the average intensity
+of the whole blocks:
+
+$ puzzle-diff -p 1.0 -c pic-a-0.jpg pic-a-1.jpg
+0.0523151
+
+
+ ------------------------ COMPARING IMAGES WITH PHP ------------------------
+
+
+A PHP extension is bundled with the Libpuzzle package, and it provides PHP
+bindings to most functions of the library.
+
+Documentation for the Libpuzzle PHP extension is available in the README-PHP
+file.
+
+
+ ------------------------ APPS USING LIBPUZZLE ------------------------
+
+
+Here are third-party projects using libpuzzle:
+
+* ftwin - http://jok.is-a-geek.net/ftwin.php
+ ftwin is a tool useful to find duplicate files according to their content on
+your file system.
+
+* Python bindings for libpuzzle: PyPuzzle
+ https://github.com/ArchangelSDY/PyPuzzle
+
+
+ ------------------------ STATUS ------------------------
+
+
+This project is unfortunately not maintained any more. Pull requests are
+always welcome, but I don't use this library any more and I don't have enough
+spare time to actively work on it.
diff --git a/deduper/libpuzzle/README-PHP b/deduper/libpuzzle/README-PHP
new file mode 100644
index 0000000..6b14fb9
--- /dev/null
+++ b/deduper/libpuzzle/README-PHP
@@ -0,0 +1,76 @@
+
+ .:. LIBPUZZLE - PHP EXTENSION .:.
+
+ http://libpuzzle.pureftpd.org
+
+
+ ------------------------ PHP EXTENSION ------------------------
+
+
+The Puzzle library can also be used through PHP, using a native extension.
+
+Prerequisites are the PHP headers, libtool, autoconf and automake.
+
+Here are the basic steps in order to install the extension:
+
+(on OpenBSD: export AUTOMAKE_VERSION=1.9 ; export AUTOCONF_VERSION=2.61)
+
+cd php/libpuzzle
+phpize
+./configure --with-libpuzzle
+make clean
+make
+make install
+
+If libpuzzle is installed in a non-standard location, use:
+./configure --with-libpuzzle=/base/directory/for/libpuzzle
+
+Then edit your php.ini file and add:
+
+extension=libpuzzle.so
+
+
+ ------------------------ USAGE ------------------------
+
+
+The PHP extension provides bindings for the following tuning functions:
+- puzzle_set_max_width()
+- puzzle_set_max_height()
+- puzzle_set_lambdas()
+- puzzle_set_noise_cutoff()
+- puzzle_set_p_ratio()
+- puzzle_set_contrast_barrier_for_cropping()
+- puzzle_set_max_cropping_ratio()
+- puzzle_set_autocrop()
+
+Have a look at the puzzle_set man page for more info about those.
+
+Getting the signature of a picture is as simple as:
+
+$signature = puzzle_fill_cvec_from_file($filename);
+
+In order to compute the similarity between two pictures using their
+signatures, use:
+
+$d = puzzle_vector_normalized_distance($signature1, $signature2);
+
+The result is between 0.0 and 1.0, with 0.6 being a good threshold to detect
+visually similar pictures.
+
+The PUZZLE_CVEC_SIMILARITY_THRESHOLD, PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD,
+PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD and PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD
+constants can also be used to get common thresholds :
+
+if ($d < PUZZLE_CVEC_SIMILARITY_THRESHOLD) {
+ echo "Pictures look similar\n";
+}
+
+Before storing a signature into a database, you can compress it in order to
+save some storage space:
+
+$compressed_signature = puzzle_compress_cvec($signature);
+
+Before use, those compressed signatures must be uncompressed with:
+
+$signature = puzzle_uncompress_cvec($compressed_signature);
+
diff --git a/deduper/libpuzzle/THANKS b/deduper/libpuzzle/THANKS
new file mode 100644
index 0000000..86ef2e1
--- /dev/null
+++ b/deduper/libpuzzle/THANKS
@@ -0,0 +1,6 @@
+Xerox Research Center
+H. CHI WONG
+Marschall BERN
+David GOLDBERG
+Sameh CHAFIK
+Gregory MAXWELL
diff --git a/deduper/libpuzzle/autogen.sh b/deduper/libpuzzle/autogen.sh
new file mode 100755
index 0000000..4717fc4
--- /dev/null
+++ b/deduper/libpuzzle/autogen.sh
@@ -0,0 +1,17 @@
+#! /bin/sh
+# Regenerates the autotools build files for libpuzzle.
+
+# Preferred path: when an executable autoreconf is found, hand over to it.
+# exec replaces this shell, so nothing below runs in that case.
+if [ -x "`which autoreconf 2>/dev/null`" ] ; then
+ exec autoreconf -ivf
+fi
+
+# Fallback: run the individual tools by hand. Use glibtoolize when it is
+# available, otherwise libtoolize.
+if glibtoolize --version > /dev/null 2>&1; then
+ LIBTOOLIZE='glibtoolize'
+else
+ LIBTOOLIZE='libtoolize'
+fi
+
+# Each step runs only if the previous one succeeded.
+$LIBTOOLIZE && \
+aclocal && \
+autoheader && \
+automake --add-missing --force-missing --include-deps && \
+autoconf
diff --git a/deduper/libpuzzle/composer.json b/deduper/libpuzzle/composer.json
new file mode 100644
index 0000000..4cd00e2
--- /dev/null
+++ b/deduper/libpuzzle/composer.json
@@ -0,0 +1,10 @@
+{
+ "name": "jedisct1/libpuzzle",
+ "description": "A library to quickly find visually similar images.",
+ "version": "0.10.0",
+ "license": "MIT",
+ "type": "library",
+ "require": {
+ "php": "5.*"
+ }
+}
diff --git a/deduper/libpuzzle/configure.ac b/deduper/libpuzzle/configure.ac
new file mode 100644
index 0000000..1abf0f6
--- /dev/null
+++ b/deduper/libpuzzle/configure.ac
@@ -0,0 +1,70 @@
+# -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+#
+# Configure script source for libpuzzle: checks the toolchain, locates libgd
+# through gdlib-config and generates the Makefiles listed at the bottom.
+
+AC_PREREQ(2.61)
+AC_INIT(libpuzzle, 0.11, bugs@pureftpd.org)
+AC_CONFIG_SRCDIR([src/puzzle.h])
+AC_CONFIG_HEADER([config.h])
+AM_INIT_AUTOMAKE([1.9 dist-bzip2])
+AM_MAINTAINER_MODE
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_CC
+AC_PROG_CPP
+AC_PROG_INSTALL
+AC_PROG_LN_S
+AC_PROG_MAKE_SET
+# Compiler and linker flags for libgd are taken from gdlib-config.
+# NOTE(review): GDLIBCONFIG is used below without checking that the program
+# was actually found - confirm whether a missing gdlib-config should be fatal.
+AC_PATH_PROG(GDLIBCONFIG, [gdlib-config])
+CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE=1"
+CPPFLAGS="$CPPFLAGS `$GDLIBCONFIG --cflags`"
+LDFLAGS="$LDFLAGS `$GDLIBCONFIG --ldflags`"
+LDADD="$LDADD `$GDLIBCONFIG --libs`"
+
+# Checks for libraries.
+
+# Abort configuration when libgd cannot be linked.
+AC_CHECK_LIB([gd], [gdImageCreateFromGd2],,
+ AC_ERROR([libgd2 development files not found]))
+
+# Checks for header files.
+AC_HEADER_STDC
+AM_PROG_LIBTOOL
+AC_CHECK_HEADERS([limits.h memory.h stddef.h stdlib.h string.h unistd.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_CONST
+AC_TYPE_SIZE_T
+AC_TYPE_SSIZE_T
+AC_TYPE_OFF_T
+
+# Checks for library functions.
+AC_FUNC_MALLOC
+AC_FUNC_REALLOC
+AC_FUNC_MEMCMP
+# floor() and round() are looked up in the "math" library when the default
+# link line does not provide them.
+AC_CHECK_FUNC([floor], ,[AC_CHECK_LIB([math], [floor])])
+AC_CHECK_FUNC([round], ,[AC_CHECK_LIB([math], [round])])
+AC_CHECK_FUNCS([strtoul])
+
+AC_SUBST([MAINT])
+
+# Makefiles generated by configure, one per directory of the source tree.
+AC_CONFIG_FILES([Makefile
+ man/Makefile
+ src/Makefile
+ src/pics/Makefile
+ php/Makefile
+ php/libpuzzle/Makefile
+ php/libpuzzle/include/Makefile
+ php/libpuzzle/modules/Makefile
+ php/libpuzzle/build/Makefile
+ php/libpuzzle/tests/Makefile
+ php/libpuzzle/tests/pics/Makefile
+ php/examples/Makefile
+ php/examples/similar/Makefile
+ ])
+AC_OUTPUT
+
+# Closing banner shown at the end of the configure run.
+AC_MSG_NOTICE([+-------------------------------------------------------+])
+AC_MSG_NOTICE([| You can subscribe to the Libpuzzle users mailing-list |])
+AC_MSG_NOTICE([| to ask for help and to stay informed of new releases. |])
+AC_MSG_NOTICE([| Go to http://libpuzzle.pureftpd.org/ml/ now! |])
+AC_MSG_NOTICE([+-------------------------------------------------------+])
diff --git a/deduper/libpuzzle/man/Makefile.am b/deduper/libpuzzle/man/Makefile.am
new file mode 100644
index 0000000..a3a78a5
--- /dev/null
+++ b/deduper/libpuzzle/man/Makefile.am
@@ -0,0 +1,7 @@
+# Manual pages installed with libpuzzle.
+man_MANS = \
+ libpuzzle.3 \
+ puzzle_set.3 \
+ puzzle-diff.8
+
+# The same pages are also added to the distribution archive.
+EXTRA_DIST = \
+ $(man_MANS)
diff --git a/deduper/libpuzzle/man/libpuzzle.3 b/deduper/libpuzzle/man/libpuzzle.3
new file mode 100644
index 0000000..98cfcbb
--- /dev/null
+++ b/deduper/libpuzzle/man/libpuzzle.3
@@ -0,0 +1,296 @@
+.\"
+.\" Copyright (c) 2007-2014 Frank DENIS <j at pureftpd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: March 31 2011 $
+.Dt LIBPUZZLE 3
+.Sh NAME
+.Nm puzzle_init_cvec ,
+.Nm puzzle_init_dvec ,
+.Nm puzzle_fill_dvec_from_file ,
+.Nm puzzle_fill_cvec_from_file ,
+.Nm puzzle_fill_dvec_from_mem ,
+.Nm puzzle_fill_cvec_from_mem ,
+.Nm puzzle_fill_cvec_from_dvec ,
+.Nm puzzle_free_cvec ,
+.Nm puzzle_free_dvec ,
+.Nm puzzle_init_compressed_cvec ,
+.Nm puzzle_free_compressed_cvec ,
+.Nm puzzle_compress_cvec ,
+.Nm puzzle_uncompress_cvec ,
+.Nm puzzle_vector_normalized_distance
+.Nd compute comparable signatures of bitmap images.
+.Sh SYNOPSIS
+.Fd #include <puzzle.h>
+.Ft void
+.Fn puzzle_init_context "PuzzleContext *context"
+.Ft void
+.Fn puzzle_free_context "PuzzleContext *context"
+.Ft void
+.Fn puzzle_init_cvec "PuzzleContext *context" "PuzzleCvec *cvec"
+.Ft void
+.Fn puzzle_init_dvec "PuzzleContext *context" "PuzzleDvec *dvec"
+.Ft int
+.Fn puzzle_fill_dvec_from_file "PuzzleContext *context" "PuzzleDvec * dvec" "const char *file"
+.Ft int
+.Fn puzzle_fill_cvec_from_file "PuzzleContext *context" "PuzzleCvec * cvec" "const char *file"
+.Ft int
+.Fn puzzle_fill_dvec_from_mem "PuzzleContext *context" "PuzzleDvec * dvec" "const void *mem" "size_t size"
+.Ft int
+.Fn puzzle_fill_cvec_from_mem "PuzzleContext *context" "PuzzleCvec * cvec" "const void *mem" "size_t size"
+.Ft int
+.Fn puzzle_fill_cvec_from_dvec "PuzzleContext *context" "PuzzleCvec * cvec" "const PuzzleDvec *dvec"
+.Ft void
+.Fn puzzle_free_cvec "PuzzleContext *context" "PuzzleCvec *cvec"
+.Ft void
+.Fn puzzle_free_dvec "PuzzleContext *context" "PuzzleDvec *dvec"
+.Ft void
+.Fn puzzle_init_compressed_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec"
+.Ft void
+.Fn puzzle_free_compressed_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec"
+.Ft int
+.Fn puzzle_compress_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" "const PuzzleCvec * cvec"
+.Ft int
+.Fn puzzle_uncompress_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" "PuzzleCvec * const cvec"
+.Ft double
+.Fn puzzle_vector_normalized_distance "PuzzleContext *context" "const PuzzleCvec * cvec1" "const PuzzleCvec * cvec2" "int fix_for_texts"
+.Sh DESCRIPTION
+The Puzzle library computes a signature out of a bitmap picture.
+Signatures are comparable and similar pictures have similar signatures.
+.Pp
+After a picture has been loaded and uncompressed, featureless parts of
+the image are skipped (autocrop), unless that step has been explicitly
+disabled, see
+.Xr puzzle_set 3
+.Sh LIBPUZZLE CONTEXT
+Every public function requires a
+.Va PuzzleContext
+object, that stores every required tunables.
+.Pp
+Any application using libpuzzle should initialize a
+.Va PuzzleContext
+object with
+.Fn puzzle_init_context
+and free it after use with
+.Fn puzzle_free_context
+.Bd \-literal \-offset indent
+PuzzleContext context;
+
+puzzle_init_context(&context);
+ ...
+puzzle_free_context(&context);
+.Ed
+.Sh DVEC AND CVEC VECTORS
+The next step is to divide the cropped image into a grid and to compute
+the average intensity of soft\(hyedged pixels in every block. The result is a
+.Va PuzzleDvec
+object.
+.Pp
+.Va PuzzleDvec
+objects should be initialized before use, with
+.Fn puzzle_init_dvec
+and freed after use with
+.Fn puzzle_free_dvec
+.Pp
+The
+.Va PuzzleDvec
+structure has two important fields:
+.Va vec
+is the pointer to the first element of the array containing the average
+intensities, and
+.Va sizeof_compressed_vec
+is the number of elements.
+.Pp
+.Va PuzzleDvec
+objects are not comparable, so what you usually want is to transform these
+objects into
+.Va PuzzleCvec
+objects.
+.Pp
+A
+.Va PuzzleCvec
+object is a vector with relationships between adjacent blocks from a
+.Va PuzzleDvec
+object.
+.Pp
+The
+.Fn puzzle_fill_cvec_from_dvec
+fills a
+.Va PuzzleCvec
+object from a
+.Va PuzzleDvec
+object.
+.Pp
+But just like the other structure,
+.Va PuzzleCvec
+objects must be initialized and freed with
+.Fn puzzle_init_cvec
+and
+.Fn puzzle_free_cvec
+.Pp
+.Va PuzzleCvec
+objects have a vector whose first element is in the
+.Va vec
+field, and the number of elements is in the
+.Va sizeof_vec
+field
+.Sh LOADING PICTURES
+.Va PuzzleDvec
+and
+.Va PuzzleCvec
+objects can be computed from a bitmap picture file, with
+.Fn puzzle_fill_dvec_from_file
+and
+.Fn puzzle_fill_cvec_from_file
+.Pp
+.Em GIF
+,
+.Em PNG
+and
+.Em JPEG
+file formats are currently supported and automatically recognized.
+.Pp
+Here's a simple example that creates a
+.Va PuzzleCvec
+object out of a file.
+.Bd \-literal \-offset indent
+PuzzleContext context;
+PuzzleCvec cvec;
+
+puzzle_init_context(&context);
+puzzle_init_cvec(&context, &cvec);
+puzzle_fill_cvec_from_file(&context, &cvec, "test\-picture.jpg");
+ ...
+puzzle_free_cvec(&context, &cvec);
+puzzle_free_context(&context);
+.Ed
+.Sh COMPARING VECTORS
+In order to check whether two pictures are similar, you need to compare their
+.Va PuzzleCvec
+signatures, using
+.Fn puzzle_vector_normalized_distance
+.Pp
+That function returns a distance, between 0.0 and 1.0. The lesser, the nearer.
+.Pp
+Tests on common pictures show that a normalized distance of 0.6 (also defined as
+.Va PUZZLE_CVEC_SIMILARITY_THRESHOLD
+) means that both pictures are visually similar.
+.Pp
+If that threshold is not right for your set of pictures, you can experiment
+with
+.Va PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD
+,
+.Va PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD
+and
+.Va PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD
+or with your own value.
+.Pp
+If the
+.Fa fix_for_texts
+of
+.Fn puzzle_vector_normalized_distance
+is
+.Em 1
+, a fix is applied to the computation in order to deal with bitmap pictures
+that contain text. That fix is recommended, as it allows using the same
+threshold for that kind of picture as for generic pictures.
+.Pp
+If
+.Fa fix_for_texts
+is
+.Em 0
+, that special way of computing the normalized distance is disabled.
+.Bd \-literal \-offset indent
+PuzzleContext context;
+PuzzleCvec cvec1, cvec2;
+double d;
+
+puzzle_init_context(&context);
+puzzle_init_cvec(&context, &cvec1);
+puzzle_init_cvec(&context, &cvec2);
+puzzle_fill_cvec_from_file(&context, &cvec1, "test\-picture\-1.jpg");
+puzzle_fill_cvec_from_file(&context, &cvec2, "test\-picture\-2.jpg");
+d = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, 1);
+if (d < PUZZLE_CVEC_SIMILARITY_THRESHOLD) {
+ puts("Pictures are similar");
+}
+puzzle_free_cvec(&context, &cvec2);
+puzzle_free_cvec(&context, &cvec1);
+puzzle_free_context(&context);
+.Ed
+.Sh CVEC COMPRESSION
+In order to reduce storage needs,
+.Va PuzzleCvec
+objects can be compressed to 1/3 of their original size.
+.Pp
+.Va PuzzleCompressedCvec
+structures hold the compressed data. Before and after use, these structures
+have to be passed to
+.Fn puzzle_init_compressed_cvec
+and
+.Fn puzzle_free_compressed_cvec
+.Pp
+.Fn puzzle_compress_cvec
+compresses a
+.Va PuzzleCvec
+object into a
+.Va PuzzleCompressedCvec
+object.
+.Pp
+And
+.Fn puzzle_uncompress_cvec
+uncompresses a
+.Va PuzzleCompressedCvec
+object into a
+.Va PuzzleCvec
+object.
+.Bd \-literal \-offset indent
+PuzzleContext context;
+PuzzleCvec cvec;
+PuzzleCompressedCvec c_cvec;
+ ...
+puzzle_init_compressed_cvec(&context, &c_cvec);
+puzzle_compress_cvec(&context, &c_cvec, &cvec);
+ ...
+puzzle_free_compressed_cvec(&context, &c_cvec);
+.Ed
+The
+.Va PuzzleCompressedCvec
+structure has two important fields:
+.Va vec
+that is a pointer to the first element of the compressed data, and
+.Va sizeof_compressed_vec
+that contains the number of elements.
+.Sh RETURN VALUE
+Functions return
+.Em 0
+on success, and
+.Em \-1
+if something went wrong.
+.Sh AUTHORS
+.Nf
+Frank DENIS
+libpuzzle at pureftpd dot org
+.Fi
+.Sh ACKNOWLEDGMENTS
+.Nf
+Xerox Research Center
+H. CHI WONG
+Marschall BERN
+David GOLDBERG
+Sameh SCHAFIK
+.Fi
+.Sh SEE ALSO
+.Xr puzzle_set 3
+.Xr puzzle\-diff 8
diff --git a/deduper/libpuzzle/man/puzzle-diff.8 b/deduper/libpuzzle/man/puzzle-diff.8
new file mode 100644
index 0000000..5744b5a
--- /dev/null
+++ b/deduper/libpuzzle/man/puzzle-diff.8
@@ -0,0 +1,58 @@
+.\"
+.\" Copyright (c) 2007-2014 Frank DENIS <j at pureftpd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: September 23 2007 $
+.Dt PUZZLE-DIFF 8
+.Os
+.Sh NAME
+.Nm puzzle\-diff
+.Nd compare pictures with libpuzzle
+.Sh SYNOPSIS
+.Nm puzzle\-diff
+[\-b <contrast barrier for cropping>] [\-c] [\-C <max cropping ratio>]
+[\-e] [\-E <similarity threshold>] [\-h] [\-H <max height>] [\-l <lambdas>]
+[\-n <noise cutoff>] [\-p <p ratio>] [\-t] [\-W <max width>]
+<file 1>
+<file 2>
+.Sh DESCRIPTION
+puzzle\-diff compares two pictures and outputs the normalized distance.
+.Pp
+Try
+.Em puzzle\-diff \-h
+for more info.
+.Sh EXAMPLES
+Output distance between two images:
+.Bd -literal -offset indent
+$ puzzle\-diff pic\-a\-0.jpg pic\-a\-1.jpg
+0.102286
+.Ed
+.Pp
+Compare two images, exit with 10 if they look the same, exit with 20 if
+they don't (may be useful for scripts):
+.Bd -literal -offset indent
+$ puzzle\-diff \-e pic\-a\-0.jpg pic\-a\-1.jpg
+$ echo $?
+10
+.Ed
+.Pp
+Compute distance, without cropping and with computing the average intensity
+of the whole blocks:
+.Bd -literal -offset indent
+$ puzzle\-diff \-p 1.0 \-c pic\-a\-0.jpg pic\-a\-1.jpg
+0.0523151
+.Ed
+.Sh SEE ALSO
+.Xr libpuzzle 3
+.Xr puzzle_set 3
diff --git a/deduper/libpuzzle/man/puzzle_set.3 b/deduper/libpuzzle/man/puzzle_set.3
new file mode 100644
index 0000000..a8d017b
--- /dev/null
+++ b/deduper/libpuzzle/man/puzzle_set.3
@@ -0,0 +1,129 @@
+.\"
+.\" Copyright (c) 2007-2014 Frank DENIS <j at pureftpd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: September 24 2007 $
+.Dt PUZZLE_SET 3
+.Sh NAME
+.Nm puzzle_set_max_width ,
+.Nm puzzle_set_max_height ,
+.Nm puzzle_set_lambdas ,
+.Nm puzzle_set_p_ratio ,
+.Nm puzzle_set_noise_cutoff ,
+.Nm puzzle_set_contrast_barrier_for_cropping ,
+.Nm puzzle_set_max_cropping_ratio ,
+.Nm puzzle_set_autocrop
+.Nd set tunables for libpuzzle functions.
+.Sh SYNOPSIS
+.Fd #include <puzzle.h>
+.Ft int
+.Fn puzzle_set_max_width "PuzzleContext *context" "unsigned int width"
+.Ft int
+.Fn puzzle_set_max_height "PuzzleContext *context" "unsigned int height"
+.Ft int
+.Fn puzzle_set_lambdas "PuzzleContext *context" "unsigned int lambdas"
+.Ft int
+.Fn puzzle_set_p_ratio "PuzzleContext *context" "double p_ratio"
+.Ft int
+.Fn puzzle_set_noise_cutoff "PuzzleContext *context" "double noise_cutoff"
+.Ft int
+.Fn puzzle_set_contrast_barrier_for_cropping "PuzzleContext *context" "double barrier"
+.Ft int
+.Fn puzzle_set_max_cropping_ratio "PuzzleContext *context" "double ratio"
+.Ft int
+.Fn puzzle_set_autocrop "PuzzleContext *context" "int enable"
+.Sh DESCRIPTION
+While default values have been chosen to be ok for most people, the
+.Fn puzzle_set_*
+functions are knobs to fit the algorithm to your set of data and to your
+applications.
+.Sh LAMBDAS
+By default, pictures are divided in 9 x 9 blocks.
+.Pp
+.Em 9
+is the
+.Em lambdas
+value, and it can be changed with
+.Fn puzzle_set_lambdas .
+.Pp
+For large databases, for complex images, for images with a lot of text or
+for sets of near\(hysimilar images, it might be better to raise that value to
+.Em 11
+or even
+.Em 13 .
+.Pp
+However, raising that value obviously means that vectors will require more
+storage space.
+.Pp
+The
+.Em lambdas
+value should remain the same in order to get comparable vectors. So if you
+pick
+.Em 11
+(for instance), you should always use that value for all pictures you will
+compute a digest for.
+.Fn puzzle_set_p_ratio
+.Pp
+The average intensity of each block is based upon a small centered zone.
+.Pp
+The "p ratio" determines the size of that zone. The default is 2.0, and that
+ratio mimics the behavior that is described in the reference algorithm.
+.Pp
+For very specific cases (complex images) or if you get too many false
+positives, as an alternative to increasing lambdas, you can try to lower that
+value, for instance to 1.5.
+.Pp
+The lowest acceptable value is 1.0.
+.Sh MAXIMUM SIZES
+In order to avoid CPU starvation, pictures won't be processed if their width
+or height is larger than 3000 pixels.
+.Pp
+These limits are rather large, but if you ever need to change them, the
+.Fn puzzle_set_max_width
+and
+.Fn puzzle_set_max_height
+are available.
+.Sh NOISE CUTOFF
+The noise cutoff defaults to 2. If you raise that value, more zones with
+little difference of intensity will be considered as similar.
+.Pp
+Unless you have very specialized sets of pictures, you probably don't want
+to change this.
+.Sh AUTOCROP
+By default, featureless borders of the original image are ignored. The size
+of each border depends on the sum of absolute values of differences between
+adjacent pixels, relative to the total sum.
+.Pp
+That feature can be disabled with
+.Fn puzzle_set_autocrop "0" .
+Any other value will enable it.
+.Pp
+.Fn puzzle_set_contrast_barrier_for_cropping
+changes the tolerance. The default value is 5. Less shaves less, more shaves
+more.
+.Pp
+.Fn puzzle_set_max_cropping_ratio
+This is a safe\(hyguard against unwanted excessive auto\(hycropping.
+.Pp
+The default (0.25) means that no more than 25% of the total width (or
+height) will ever be shaved.
+.Sh RETURN VALUE
+Functions return
+.Em 0
+on success, and
+.Em \-1
+if something went wrong.
+.Sh SEE ALSO
+.Xr libpuzzle 3
+.Xr puzzle\-diff 8
diff --git a/deduper/libpuzzle/php/Makefile.am b/deduper/libpuzzle/php/Makefile.am
new file mode 100644
index 0000000..dc0165f
--- /dev/null
+++ b/deduper/libpuzzle/php/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = \
+ libpuzzle \
+ examples
diff --git a/deduper/libpuzzle/php/examples/Makefile.am b/deduper/libpuzzle/php/examples/Makefile.am
new file mode 100644
index 0000000..82c81ba
--- /dev/null
+++ b/deduper/libpuzzle/php/examples/Makefile.am
@@ -0,0 +1,2 @@
+SUBDIRS = \
+ similar
diff --git a/deduper/libpuzzle/php/examples/similar/Makefile.am b/deduper/libpuzzle/php/examples/similar/Makefile.am
new file mode 100644
index 0000000..126f6df
--- /dev/null
+++ b/deduper/libpuzzle/php/examples/similar/Makefile.am
@@ -0,0 +1,6 @@
+EXTRA_DIST = \
+ schema.sqlite3.sql \
+ schema.pgsql.sql \
+ similar.php \
+ similar.inc.php \
+ config.inc.php
diff --git a/deduper/libpuzzle/php/examples/similar/config.inc.php b/deduper/libpuzzle/php/examples/similar/config.inc.php
new file mode 100644
index 0000000..d4e3b41
--- /dev/null
+++ b/deduper/libpuzzle/php/examples/similar/config.inc.php
@@ -0,0 +1,9 @@
+<?php
+
+define('MAX_IMAGE_SIZE', 1024 * 1024 * 4);
+define('MAX_URL_SIZE', 255);
+define('DB_DSN', 'sqlite:similar.sqlite3');
+define('MAX_WORDS', 100);
+define('MAX_WORD_LENGTH', 10);
+
+?>
diff --git a/deduper/libpuzzle/php/examples/similar/schema.pgsql.sql b/deduper/libpuzzle/php/examples/similar/schema.pgsql.sql
new file mode 100644
index 0000000..7dc6bc1
--- /dev/null
+++ b/deduper/libpuzzle/php/examples/similar/schema.pgsql.sql
@@ -0,0 +1,230 @@
+--
+-- PostgreSQL database dump
+--
+
+SET client_encoding = 'UTF8';
+SET standard_conforming_strings = off;
+SET check_function_bodies = false;
+SET client_min_messages = warning;
+SET escape_string_warning = off;
+
+SET SESSION AUTHORIZATION 'similar';
+
+--
+-- Name: SCHEMA public; Type: COMMENT; Schema: -; Owner: similar
+--
+
+COMMENT ON SCHEMA public IS 'Standard public schema';
+
+
+SET search_path = public, pg_catalog;
+
+SET default_tablespace = '';
+
+SET default_with_oids = false;
+
+--
+-- Name: pictures; Type: TABLE; Schema: public; Owner: similar; Tablespace:
+--
+
+CREATE TABLE pictures (
+ id integer NOT NULL,
+ digest character(32) NOT NULL,
+ CONSTRAINT ck_digest CHECK ((char_length(digest) = 32))
+);
+
+
+--
+-- Name: pictures_id_seq; Type: SEQUENCE; Schema: public; Owner: similar
+--
+
+CREATE SEQUENCE pictures_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MAXVALUE
+ NO MINVALUE
+ CACHE 1;
+
+
+--
+-- Name: pictures_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: similar
+--
+
+ALTER SEQUENCE pictures_id_seq OWNED BY pictures.id;
+
+
+--
+-- Name: sentpictures; Type: TABLE; Schema: public; Owner: similar; Tablespace:
+--
+
+CREATE TABLE sentpictures (
+ id integer NOT NULL,
+ url character varying(255) NOT NULL,
+ sender character varying(100) NOT NULL,
+ picture_id integer NOT NULL,
+ CONSTRAINT ck_url CHECK (((url)::text <> ''::text))
+);
+
+
+--
+-- Name: sentpictures_id_seq; Type: SEQUENCE; Schema: public; Owner: similar
+--
+
+CREATE SEQUENCE sentpictures_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MAXVALUE
+ NO MINVALUE
+ CACHE 1;
+
+
+--
+-- Name: sentpictures_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: similar
+--
+
+ALTER SEQUENCE sentpictures_id_seq OWNED BY sentpictures.id;
+
+
+--
+-- Name: signatures; Type: TABLE; Schema: public; Owner: similar; Tablespace:
+--
+
+CREATE TABLE signatures (
+ id integer NOT NULL,
+ compressed_signature bytea NOT NULL,
+ picture_id integer NOT NULL,
+ CONSTRAINT ck_signature CHECK ((octet_length(compressed_signature) >= 182))
+);
+
+
+--
+-- Name: signatures_id_seq; Type: SEQUENCE; Schema: public; Owner: similar
+--
+
+CREATE SEQUENCE signatures_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MAXVALUE
+ NO MINVALUE
+ CACHE 1;
+
+
+--
+-- Name: signatures_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: similar
+--
+
+ALTER SEQUENCE signatures_id_seq OWNED BY signatures.id;
+
+
+--
+-- Name: words; Type: TABLE; Schema: public; Owner: similar; Tablespace:
+--
+
+CREATE TABLE words (
+ pos_and_word bytea NOT NULL,
+ signature_id integer NOT NULL,
+ CONSTRAINT ck_pos_and_word CHECK ((octet_length(pos_and_word) >= 2))
+);
+
+
+--
+-- Name: id; Type: DEFAULT; Schema: public; Owner: similar
+--
+
+ALTER TABLE pictures ALTER COLUMN id SET DEFAULT nextval('pictures_id_seq'::regclass);
+
+
+--
+-- Name: id; Type: DEFAULT; Schema: public; Owner: similar
+--
+
+ALTER TABLE sentpictures ALTER COLUMN id SET DEFAULT nextval('sentpictures_id_seq'::regclass);
+
+
+--
+-- Name: id; Type: DEFAULT; Schema: public; Owner: similar
+--
+
+ALTER TABLE signatures ALTER COLUMN id SET DEFAULT nextval('signatures_id_seq'::regclass);
+
+
+--
+-- Name: pictures_pkey; Type: CONSTRAINT; Schema: public; Owner: similar; Tablespace:
+--
+
+ALTER TABLE ONLY pictures
+ ADD CONSTRAINT pictures_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: sentpictures_pkey; Type: CONSTRAINT; Schema: public; Owner: similar; Tablespace:
+--
+
+ALTER TABLE ONLY sentpictures
+ ADD CONSTRAINT sentpictures_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: signatures_pkey; Type: CONSTRAINT; Schema: public; Owner: similar; Tablespace:
+--
+
+ALTER TABLE ONLY signatures
+ ADD CONSTRAINT signatures_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: idx_digest; Type: INDEX; Schema: public; Owner: similar; Tablespace:
+--
+
+CREATE UNIQUE INDEX idx_digest ON pictures USING btree (digest);
+
+
+--
+-- Name: idx_picture_id; Type: INDEX; Schema: public; Owner: similar; Tablespace:
+--
+
+CREATE INDEX idx_picture_id ON sentpictures USING btree (picture_id);
+
+
+--
+-- Name: idx_pos_and_word; Type: INDEX; Schema: public; Owner: similar; Tablespace:
+--
+
+CREATE INDEX idx_pos_and_word ON words USING btree (pos_and_word);
+
+
+--
+-- Name: idx_url; Type: INDEX; Schema: public; Owner: similar; Tablespace:
+--
+
+CREATE UNIQUE INDEX idx_url ON sentpictures USING btree (url);
+
+
+--
+-- Name: sentpictures_picture_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: similar
+--
+
+ALTER TABLE ONLY sentpictures
+ ADD CONSTRAINT sentpictures_picture_id_fkey FOREIGN KEY (picture_id) REFERENCES pictures(id) ON UPDATE CASCADE ON DELETE CASCADE;
+
+
+--
+-- Name: signatures_picture_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: similar
+--
+
+ALTER TABLE ONLY signatures
+ ADD CONSTRAINT signatures_picture_id_fkey FOREIGN KEY (picture_id) REFERENCES pictures(id) ON UPDATE CASCADE ON DELETE CASCADE;
+
+
+--
+-- Name: words_signature_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: similar
+--
+
+ALTER TABLE ONLY words
+ ADD CONSTRAINT words_signature_id_fkey FOREIGN KEY (signature_id) REFERENCES signatures(id) ON UPDATE CASCADE ON DELETE CASCADE;
+
+
+--
+-- PostgreSQL database dump complete
+--
+
diff --git a/deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql b/deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql
new file mode 100644
index 0000000..dc5a6c3
--- /dev/null
+++ b/deduper/libpuzzle/php/examples/similar/schema.sqlite3.sql
@@ -0,0 +1,23 @@
+CREATE TABLE pictures (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ digest CHAR(32) NOT NULL
+);
+CREATE TABLE sentpictures (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ url VARCHAR(255) NOT NULL,
+ sender VARCHAR(100) NOT NULL,
+ picture_id INTEGER NOT NULL
+);
+CREATE TABLE signatures (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ compressed_signature CHAR(182) NOT NULL,
+ picture_id INTEGER NOT NULL
+);
+CREATE TABLE words (
+ pos_and_word CHAR(5) NOT NULL,
+ signature_id INTEGER NOT NULL
+);
+CREATE UNIQUE INDEX idx_digest ON pictures(digest);
+CREATE INDEX idx_picture_id ON sentpictures (picture_id);
+CREATE INDEX idx_pos_and_word ON words(pos_and_word);
+CREATE UNIQUE INDEX idx_url ON sentpictures (url);
diff --git a/deduper/libpuzzle/php/examples/similar/similar.inc.php b/deduper/libpuzzle/php/examples/similar/similar.inc.php
new file mode 100644
index 0000000..cfc806e
--- /dev/null
+++ b/deduper/libpuzzle/php/examples/similar/similar.inc.php
@@ -0,0 +1,120 @@
+<?php
+
+function split_into_words($sig) {
+ $words = array();
+ $u = 0;
+ do {
+ $words[$u] = substr($sig, $u, MAX_WORD_LENGTH);
+ } while (++$u < MAX_WORDS);
+
+ return $words;
+}
+
+/**
+ * Record that $url was submitted and, for a picture not seen before, store
+ * its compressed signature and the words derived from it.
+ *
+ * Everything runs in one transaction. A picture whose digest is already in
+ * the pictures table only gets a new sentpictures row.
+ *
+ * @param string $url         URL the picture was fetched from
+ * @param string $client_info value stored in sentpictures.sender
+ * @param string $md5         digest stored in / looked up from pictures.digest
+ * @param string $cvec        signature of the picture
+ * @return bool TRUE when the transaction was committed, FALSE when it was
+ *              rolled back
+ */
+function save_signature($url, $client_info, $md5, $cvec) {
+    $compressed_cvec = puzzle_compress_cvec($cvec);
+    $words = split_into_words($cvec);
+    $dbh = new PDO(DB_DSN);
+    // Make failed statements throw, so that the catch block below really
+    // rolls the transaction back instead of committing partial data.
+    $dbh->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
+    $dbh->beginTransaction();
+    try {
+        // A URL is unique in sentpictures: drop any previous submission.
+        $st = $dbh->prepare
+            ('DELETE FROM sentpictures WHERE url = :url');
+        $st->execute(array(':url' => $url));
+        $st = $dbh->prepare
+            ('SELECT id FROM pictures WHERE digest = :digest');
+        $st->execute(array(':digest' => $md5));
+        $picture_id = $st->fetchColumn();
+        $st->closeCursor();
+        $duplicate = ($picture_id !== FALSE);
+        if ($duplicate === FALSE) {
+            $st = $dbh->prepare
+                ('INSERT INTO pictures (digest) VALUES (:digest)');
+            $st->execute(array(':digest' => $md5));
+            $picture_id = $dbh->lastInsertId('id');
+        }
+        $st = $dbh->prepare
+            ('INSERT INTO sentpictures (url, sender, picture_id) ' .
+             'VALUES (:url, :sender, :picture_id)');
+        $st->execute(array(':url' => $url, ':sender' => $client_info,
+                           ':picture_id' => $picture_id));
+        if ($duplicate === TRUE) {
+            // The signature and its words were stored with the first copy.
+            $dbh->commit();
+            return TRUE;
+        }
+        $st = $dbh->prepare
+            ('INSERT INTO signatures (compressed_signature, picture_id) ' .
+             'VALUES(:compressed_signature, :picture_id)');
+        $st->execute(array(':compressed_signature' => $compressed_cvec,
+                           ':picture_id' => $picture_id));
+        $signature_id = $dbh->lastInsertId('id');
+        $st = $dbh->prepare
+            ('INSERT INTO words (pos_and_word, signature_id) ' .
+             'VALUES (:pos_and_word, :signature_id)');
+        foreach ($words as $u => $word) {
+            // Each word is stored prefixed with its position as one byte.
+            $st->execute(array(':pos_and_word'
+                               => chr($u) . puzzle_compress_cvec($word),
+                               ':signature_id' => $signature_id));
+        }
+        $dbh->commit();
+        return TRUE;
+    } catch (Exception $e) {
+        // Log instead of dumping the exception (message, trace) into the page.
+        error_log('save_signature: ' . $e->getMessage());
+        $dbh->rollBack();
+    }
+    return FALSE;
+}
+
+/**
+ * Return the URLs of stored pictures whose signature is close to $cvec.
+ *
+ * Candidate signatures are those sharing at least one (position, word) pair
+ * with $cvec. A candidate is kept when its normalized distance to $cvec is
+ * below $threshold and strictly positive.
+ *
+ * @param string $md5       digest of the picture (not used by the lookup)
+ * @param string $cvec      signature of the picture to compare against
+ * @param float  $threshold maximum distance for two pictures to match
+ * @return array list of URLs read from sentpictures, possibly empty
+ */
+function find_similar_pictures($md5, $cvec,
+                               $threshold = PUZZLE_CVEC_SIMILARITY_THRESHOLD) {
+    $words = split_into_words($cvec);
+    $dbh = new PDO(DB_DSN);
+    $dbh->beginTransaction();
+    $quoted_words = array();
+    foreach ($words as $u => $word) {
+        $quoted_words[] = $dbh->quote(chr($u) . puzzle_compress_cvec($word));
+    }
+    $sql = 'SELECT DISTINCT(signature_id) AS signature_id FROM words ' .
+           'WHERE pos_and_word IN (' . implode(',', $quoted_words) . ')';
+    $res_words = $dbh->query($sql);
+    $scores = array();
+    // query() returns FALSE on failure when PDO is in its silent error mode.
+    if ($res_words !== FALSE) {
+        $st = $dbh->prepare('SELECT compressed_signature, picture_id ' .
+                            'FROM signatures WHERE id = :id');
+        while (($signature_id = $res_words->fetchColumn()) !== FALSE) {
+            $st->execute(array(':id' => $signature_id));
+            $row = $st->fetch();
+            $st->closeCursor();
+            if ($row === FALSE) {
+                // A word without a matching signature row: nothing to compare.
+                continue;
+            }
+            $picture_id = $row['picture_id'];
+            $found_cvec = puzzle_uncompress_cvec($row['compressed_signature']);
+            $distance = puzzle_vector_normalized_distance($cvec, $found_cvec);
+            if ($distance < $threshold && $distance > 0.0) {
+                $scores[$picture_id] = $distance;
+            }
+        }
+    }
+    $urls = array();
+    if (!empty($scores)) {
+        $quoted_ids = array();
+        foreach ($scores as $picture_id => $score) {
+            $quoted_ids[] = $dbh->quote($picture_id);
+        }
+        $res_urls = $dbh->query
+            ('SELECT url FROM sentpictures WHERE picture_id IN (' .
+             implode(',', $quoted_ids) . ')');
+        if ($res_urls !== FALSE) {
+            while (($url = $res_urls->fetchColumn()) !== FALSE) {
+                array_push($urls, $url);
+            }
+        }
+    }
+    $dbh->commit();
+
+    return $urls;
+}
+
+?>
diff --git a/deduper/libpuzzle/php/examples/similar/similar.php b/deduper/libpuzzle/php/examples/similar/similar.php
new file mode 100644
index 0000000..4b3ad40
--- /dev/null
+++ b/deduper/libpuzzle/php/examples/similar/similar.php
@@ -0,0 +1,158 @@
+<html><!-- sample image search engine, part of the libpuzzle package -->
+<head>
+</head>
+<body>
+<h1>Similar images finder using <a href="http://libpuzzle.pureftpd.org">libpuzzle</a></h1>
+<?php
+
+error_reporting(E_ALL);
+
+require_once 'config.inc.php';
+require_once 'similar.inc.php';
+
+function display_form() {
+ echo '<form action="' . htmlspecialchars($_SERVER['REQUEST_URI']) . '" ' .
+ 'method="POST">' . "\n";
+ echo 'Enter an image URL (http only):' . "\n";
+ echo '<input type="text" size="100" value="" autocomplete="off" name="url" />' . "\n";
+ echo '<input type="submit" />';
+ echo '</form>' . "\n";
+}
+
+function display_error($err) {
+ echo '<div id="err"><strong>' . htmlspecialchars($err) . '</strong></div>' . "\n";
+}
+
+function display_loading() {
+ echo '<div id="loading">Loading...</div>' . "\n";
+ @ob_flush(); flush();
+}
+
+function display_loaded() {
+ echo '<div id="loaded">Loaded.</div>' . "\n";
+ @ob_flush(); flush();
+}
+
+function display_signature_ok() {
+ echo '<div id="sig-ok">Signature computed.</div>' . "\n";
+ @ob_flush(); flush();
+}
+
+function remove_tmpfile($file) {
+ @unlink($file);
+}
+
+function get_client_info() {
+ return @$_SERVER['REMOTE_ADDR'] . '/' . time();
+}
+
+function display_similar_pictures($urls) {
+ echo '<div id="images">' . "\n";
+ foreach ($urls as $url) {
+ echo '<a href="' . htmlentities($url) . '" ' .
+ 'onclick="window.open(this.href); return false;">';
+ echo ' <img src="' . htmlentities($url) . '" alt="" />';
+ echo '</a>' . "\n";
+
+ }
+ echo '</div>' . "\n";
+}
+
+/**
+ * Download the picture at $url into a temporary file, compute its MD5 digest
+ * and its libpuzzle signature, and hand both to save_signature().
+ *
+ * Failures are reported on the page with display_form()/display_error().
+ * The temporary file is removed at shutdown by remove_tmpfile().
+ *
+ * @param string $url   URL to fetch
+ * @param mixed  &$md5  receives the result of md5_file() on the download
+ * @param mixed  &$cvec receives the result of puzzle_fill_cvec_from_file()
+ * @return bool TRUE once the signature has been computed and passed to
+ *              save_signature(), FALSE after any reported error
+ */
+function record_url($url, &$md5, &$cvec) {
+    if (function_exists('sys_get_temp_dir')) {
+        $tmpdir = sys_get_temp_dir();
+    } else {
+        $tmpdir = '/tmp';
+    }
+    $dfn = tempnam($tmpdir, 'similar-' . md5(uniqid(mt_rand(), TRUE)));
+    register_shutdown_function('remove_tmpfile', $dfn);
+    if (($dfp = fopen($dfn, 'w')) == FALSE) {
+        display_form();
+        display_error('Unable to create the temporary file');
+        return FALSE;
+    }
+    if (($fp = fopen($url, 'r')) == FALSE) {
+        fclose($dfp);
+        display_form();
+        display_error('Unable to open: [' . $url . ']');
+        return FALSE;
+    }
+    // Fetch a first chunk only: enough for getimagesize() to reject
+    // unsupported content before the whole file is downloaded.
+    $f = fread($fp, 4096);
+    if (empty($f)) {
+        fclose($fp);
+        fclose($dfp);
+        display_form();
+        display_error('Unable to load: [' . $url . ']');
+        return FALSE;
+    }
+    $written = strlen($f);
+    fwrite($dfp, $f);
+    $infos = @getimagesize($dfn);
+    if (empty($infos) ||
+        ($infos[2] !== IMAGETYPE_GIF && $infos[2] !== IMAGETYPE_JPEG &&
+         $infos[2] !== IMAGETYPE_PNG) ||
+        $infos[0] < 50 || $infos[1] < 50) {
+        fclose($fp);
+        fclose($dfp);
+        display_form();
+        display_error('Unsupported image format');
+        return FALSE;
+    }
+    // Request up to one byte more than the remaining budget, so that an
+    // oversized download actually crosses MAX_IMAGE_SIZE and ends the loop.
+    // Requesting exactly the remaining budget would end in a zero-length
+    // fread() that never makes progress.
+    while ($written <= MAX_IMAGE_SIZE && !feof($fp)) {
+        $max = MAX_IMAGE_SIZE + 1 - $written;
+        if ($max > 65536) {
+            $max = 65536;
+        }
+        $t = fread($fp, $max);
+        if ($t === FALSE) {
+            fclose($fp);
+            fclose($dfp);
+            display_form();
+            display_error('Unable to load: [' . $url . ']');
+            return FALSE;
+        }
+        fwrite($dfp, $t);
+        $written += strlen($t);
+    }
+    unset($t);
+    fclose($fp);
+    fclose($dfp);
+    if ($written > MAX_IMAGE_SIZE) {
+        display_form();
+        display_error('File too large');
+        return FALSE;
+    }
+    display_loaded();
+    $md5 = @md5_file($dfn);
+    if (empty($md5)) {
+        display_form();
+        display_error('Unable to get the MD5 of the file');
+        return FALSE;
+    }
+    $cvec = puzzle_fill_cvec_from_file($dfn);
+    if (empty($cvec)) {
+        display_form();
+        display_error('Unable to compute image signature');
+        return FALSE;
+    }
+    display_signature_ok();
+    save_signature($url, get_client_info(), $md5, $cvec);
+
+    return TRUE;
+}
+
+$url = trim(@$_POST['url']);
+if (empty($url)) {
+ display_form();
+ exit(0);
+}
+// Only plain http:// URLs with a dotted host name and a non-empty path are
+// accepted. The pattern uses '#' as delimiter: a non-ASCII delimiter is more
+// than one byte in a UTF-8 encoded file, which makes preg_match() fail and
+// every URL be rejected.
+if (strlen($url) > MAX_URL_SIZE ||
+    preg_match('#^http://([a-z0-9-]+[.])+[a-z]{2,}/.#i', $url) <= 0) {
+    display_form();
+    display_error('Invalid URL, must be http://...');
+    exit(1);
+}
+display_loading();
+$md5 = FALSE;
+$cvec = FALSE;
+if (record_url($url, $md5, $cvec) !== TRUE) {
+ exit(1);
+}
+$urls = find_similar_pictures($md5, $cvec);
+unset($cvec);
+display_form();
+display_similar_pictures($urls);
+
+?>
+</body>
+</html>
diff --git a/deduper/libpuzzle/php/libpuzzle/CREDITS b/deduper/libpuzzle/php/libpuzzle/CREDITS
new file mode 100644
index 0000000..bb6ecb3
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/CREDITS
@@ -0,0 +1 @@
+Frank DENIS <j at pureftpd.org>
diff --git a/deduper/libpuzzle/php/libpuzzle/EXPERIMENTAL b/deduper/libpuzzle/php/libpuzzle/EXPERIMENTAL
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/EXPERIMENTAL
diff --git a/deduper/libpuzzle/php/libpuzzle/LICENSE b/deduper/libpuzzle/php/libpuzzle/LICENSE
new file mode 100644
index 0000000..1ce2d05
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/LICENSE
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2007-2015 Frank DENIS <j at pureftpd.org>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
diff --git a/deduper/libpuzzle/php/libpuzzle/Makefile.am b/deduper/libpuzzle/php/libpuzzle/Makefile.am
new file mode 100644
index 0000000..f582035
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/Makefile.am
@@ -0,0 +1,15 @@
+EXTRA_DIST = \
+ CREDITS \
+ EXPERIMENTAL \
+ LICENSE \
+ README \
+ config.m4 \
+ libpuzzle.c \
+ libpuzzle.php \
+ php_libpuzzle.h
+
+SUBDIRS = \
+ build \
+ include \
+ modules \
+ tests
diff --git a/deduper/libpuzzle/php/libpuzzle/README b/deduper/libpuzzle/php/libpuzzle/README
new file mode 100644
index 0000000..7bb674f
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/README
@@ -0,0 +1,4 @@
+This is a PHP extension for libpuzzle.
+
+Have a look at the README-PHP file on top of the libpuzzle distribution for
+more info about that extension.
diff --git a/deduper/libpuzzle/php/libpuzzle/build/Makefile.am b/deduper/libpuzzle/php/libpuzzle/build/Makefile.am
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/build/Makefile.am
diff --git a/deduper/libpuzzle/php/libpuzzle/config.m4 b/deduper/libpuzzle/php/libpuzzle/config.m4
new file mode 100644
index 0000000..84f954a
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/config.m4
@@ -0,0 +1,49 @@
+dnl config.m4 for extension libpuzzle
+
+dnl If your extension references something external, use with:
+
+PHP_ARG_WITH(libpuzzle, for libpuzzle support,
+ [ --with-libpuzzle Include libpuzzle support])
+
+if test "$PHP_LIBPUZZLE" != "no"; then
+ for i in $PHP_LIBPUZZLE /usr/local /usr; do
+ if test -x "$i/bin/gdlib-config"; then
+ GDLIB_CONFIG=$i/bin/gdlib-config
+ break
+ fi
+ done
+ GDLIB_LIBS=$($GDLIB_CONFIG --ldflags --libs)
+ GDLIB_INCS=$($GDLIB_CONFIG --cflags)
+
+ PHP_EVAL_LIBLINE($GDLIB_LIBS, LIBPUZZLE_SHARED_LIBADD)
+ PHP_EVAL_INCLINE($GDLIB_INCS)
+
+ SEARCH_PATH="/usr/local /usr" # you might want to change this
+ SEARCH_FOR="/include/puzzle.h" # you most likely want to change this
+ if test -r $PHP_LIBPUZZLE/$SEARCH_FOR; then # path given as parameter
+ LIBPUZZLE_DIR=$PHP_LIBPUZZLE
+ else # search default path list
+ AC_MSG_CHECKING([for libpuzzle files in default path])
+ for i in $SEARCH_PATH ; do
+ if test -r $i/$SEARCH_FOR; then
+ LIBPUZZLE_DIR=$i
+ AC_MSG_RESULT(found in $i)
+ fi
+ done
+ fi
+
+ if test -z "$LIBPUZZLE_DIR"; then
+ AC_MSG_RESULT([not found])
+ AC_MSG_ERROR([Please reinstall the libpuzzle distribution])
+ fi
+
+ dnl # --with-libpuzzle -> add include path
+ PHP_ADD_INCLUDE($LIBPUZZLE_DIR/include)
+
+ PHP_ADD_LIBRARY_WITH_PATH(puzzle, $LIBPUZZLE_DIR/lib,
+ LIBPUZZLE_SHARED_LIBADD)
+
+ PHP_SUBST(LIBPUZZLE_SHARED_LIBADD)
+
+ PHP_NEW_EXTENSION(libpuzzle, libpuzzle.c, $ext_shared)
+fi
diff --git a/deduper/libpuzzle/php/libpuzzle/include/Makefile.am b/deduper/libpuzzle/php/libpuzzle/include/Makefile.am
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/include/Makefile.am
diff --git a/deduper/libpuzzle/php/libpuzzle/libpuzzle.c b/deduper/libpuzzle/php/libpuzzle/libpuzzle.c
new file mode 100644
index 0000000..82e84c3
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/libpuzzle.c
@@ -0,0 +1,410 @@
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "php.h"
+#include "php_ini.h"
+#include "ext/standard/info.h"
+#include <puzzle.h>
+#include "php_libpuzzle.h"
+
+ZEND_DECLARE_MODULE_GLOBALS(libpuzzle)
+
+/* True global resources - no need for thread safety here */
+static int le_libpuzzle;
+
+/* {{{ libpuzzle_functions[]
+ * Table of the functions this extension exposes to PHP scripts.
+ * The first group holds the context setters, the second group the
+ * signature (cvec) operations. Every entry passes NULL as the second
+ * PHP_FE argument. The table ends with an all-NULL sentinel entry.
+ */
+zend_function_entry libpuzzle_functions[] = {
+ PHP_FE(puzzle_set_max_width, NULL)
+ PHP_FE(puzzle_set_max_height, NULL)
+ PHP_FE(puzzle_set_lambdas, NULL)
+ PHP_FE(puzzle_set_noise_cutoff, NULL)
+ PHP_FE(puzzle_set_p_ratio, NULL)
+ PHP_FE(puzzle_set_contrast_barrier_for_cropping, NULL)
+ PHP_FE(puzzle_set_max_cropping_ratio, NULL)
+ PHP_FE(puzzle_set_autocrop, NULL)
+
+ PHP_FE(puzzle_fill_cvec_from_file, NULL)
+ PHP_FE(puzzle_compress_cvec, NULL)
+ PHP_FE(puzzle_uncompress_cvec, NULL)
+ PHP_FE(puzzle_vector_normalized_distance, NULL)
+
+ {NULL, NULL, NULL} /* Must be the last line in libpuzzle_functions[] */
+};
+/* }}} */
+
+/* {{{ libpuzzle_module_entry
+ * Module descriptor: names the extension "libpuzzle", points at the
+ * function table above and wires up the module/request start-up and
+ * shutdown hooks plus the phpinfo() hook. The header and version fields
+ * are only emitted when ZEND_MODULE_API_NO is at least 20010901.
+ */
+zend_module_entry libpuzzle_module_entry = {
+#if ZEND_MODULE_API_NO >= 20010901
+ STANDARD_MODULE_HEADER,
+#endif
+ "libpuzzle",
+ libpuzzle_functions,
+ PHP_MINIT(libpuzzle),
+ PHP_MSHUTDOWN(libpuzzle),
+ PHP_RINIT(libpuzzle), /* Replace with NULL if there's nothing to do at request start */
+ PHP_RSHUTDOWN(libpuzzle), /* Replace with NULL if there's nothing to do at request end */
+ PHP_MINFO(libpuzzle),
+#if ZEND_MODULE_API_NO >= 20010901
+ "0.10", /* Replace with version number for your extension */
+#endif
+ STANDARD_MODULE_PROPERTIES
+};
+/* }}} */
+
+#ifdef COMPILE_DL_LIBPUZZLE
+ZEND_GET_MODULE(libpuzzle)
+#endif
+
+
+/* {{{ PHP_MINIT_FUNCTION
+ * Module start-up hook. Registers the four PUZZLE_CVEC_SIMILARITY_*
+ * thresholds as PHP double constants (flags CONST_CS | CONST_PERSISTENT)
+ * under the same names, then returns SUCCESS.
+ */
+PHP_MINIT_FUNCTION(libpuzzle)
+{
+ REGISTER_DOUBLE_CONSTANT("PUZZLE_CVEC_SIMILARITY_THRESHOLD",
+ PUZZLE_CVEC_SIMILARITY_THRESHOLD,
+ CONST_CS | CONST_PERSISTENT);
+ REGISTER_DOUBLE_CONSTANT("PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD",
+ PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD,
+ CONST_CS | CONST_PERSISTENT);
+ REGISTER_DOUBLE_CONSTANT("PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD",
+ PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD,
+ CONST_CS | CONST_PERSISTENT);
+ REGISTER_DOUBLE_CONSTANT("PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD",
+ PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD,
+ CONST_CS | CONST_PERSISTENT);
+ return SUCCESS;
+}
+/* }}} */
+
+/* {{{ PHP_MSHUTDOWN_FUNCTION
+ * Module shutdown hook. There is nothing to release here, so it only
+ * returns SUCCESS.
+ */
+PHP_MSHUTDOWN_FUNCTION(libpuzzle)
+{
+ return SUCCESS;
+}
+/* }}} */
+
+/* Remove if there's nothing to do at request start */
+/* {{{ PHP_RINIT_FUNCTION
+ * Request start-up hook. Initializes the PuzzleContext kept in the module
+ * globals with puzzle_init_context(), so each request begins with a
+ * freshly initialized context, then returns SUCCESS.
+ */
+PHP_RINIT_FUNCTION(libpuzzle)
+{
+ puzzle_init_context(&LIBPUZZLE_G(global_context));
+ return SUCCESS;
+}
+/* }}} */
+
+/* Remove if there's nothing to do at request end */
+/* {{{ PHP_RSHUTDOWN_FUNCTION
+ * Request shutdown hook. Hands the PuzzleContext kept in the module
+ * globals to puzzle_free_context(), mirroring the initialization done at
+ * request start, then returns SUCCESS.
+ */
+PHP_RSHUTDOWN_FUNCTION(libpuzzle)
+{
+ puzzle_free_context(&LIBPUZZLE_G(global_context));
+ return SUCCESS;
+}
+/* }}} */
+
+/* {{{ PHP_MINFO_FUNCTION
+ * phpinfo() hook. Emits a table consisting of a single two-column header
+ * row: "libpuzzle support" / "enabled".
+ */
+PHP_MINFO_FUNCTION(libpuzzle)
+{
+ php_info_print_table_start();
+ php_info_print_table_header(2, "libpuzzle support", "enabled");
+ php_info_print_table_end();
+}
+/* }}} */
+
+/* {{{ proto string puzzle_fill_cvec_from_file(string filename)
+ * Builds the signature of an image file.
+ * Returns the raw signature bytes as a string, or FALSE when the argument
+ * cannot be parsed, the file name is empty, or the library reports an
+ * error for that file. */
+PHP_FUNCTION(puzzle_fill_cvec_from_file)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    PuzzleCvec signature;
+    char *path = NULL;
+    int path_len;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "s", &path, &path_len) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (path_len <= 0) {
+        RETURN_FALSE;
+    }
+    puzzle_init_cvec(ctx, &signature);
+    if (puzzle_fill_cvec_from_file(ctx, &signature, path) == 0) {
+        /* Last argument 1: PHP takes its own copy of the bytes. */
+        RETVAL_STRINGL(signature.vec, signature.sizeof_vec, 1);
+    } else {
+        RETVAL_FALSE;
+    }
+    /* The library-owned buffer is released on both outcomes. */
+    puzzle_free_cvec(ctx, &signature);
+}
+/* }}} */
+
+/* {{{ proto string puzzle_compress_cvec(string cvec)
+ * Packs a signature into its compressed form to save storage space.
+ * Returns the compressed bytes as a string, or FALSE when the argument
+ * cannot be parsed, is empty, or the library fails to compress it. */
+PHP_FUNCTION(puzzle_compress_cvec)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    PuzzleCompressedCvec packed;
+    PuzzleCvec plain;
+    char *input = NULL;
+    int input_len;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "s", &input, &input_len) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (input_len <= 0) {
+        RETURN_FALSE;
+    }
+    puzzle_init_compressed_cvec(ctx, &packed);
+    puzzle_init_cvec(ctx, &plain);
+    /* Borrow the PHP string as the vector storage for the call. */
+    plain.vec = input;
+    plain.sizeof_vec = (size_t) input_len;
+    if (puzzle_compress_cvec(ctx, &packed, &plain) == 0) {
+        RETVAL_STRINGL(packed.vec, packed.sizeof_compressed_vec, 1);
+    } else {
+        RETVAL_FALSE;
+    }
+    puzzle_free_compressed_cvec(ctx, &packed);
+    /* Detach the borrowed buffer so puzzle_free_cvec() does not free
+     * memory that belongs to PHP. */
+    plain.vec = NULL;
+    puzzle_free_cvec(ctx, &plain);
+}
+/* }}} */
+
+/* {{{ proto string puzzle_uncompress_cvec(string compressed_cvec)
+ * Expands a compressed signature so that it can be used for computations.
+ * Returns the uncompressed bytes as a string, or FALSE when the argument
+ * cannot be parsed, is shorter than two bytes, or the library fails to
+ * expand it. */
+PHP_FUNCTION(puzzle_uncompress_cvec)
+{
+    char *arg = NULL;
+    int arg_len;
+    PuzzleContext *context;
+    PuzzleCompressedCvec compressed_cvec;
+    PuzzleCvec cvec;
+
+    context = &LIBPUZZLE_G(global_context);
+    /* The library inspects the first two bytes of the compressed vector
+     * and treats anything shorter as an internal bug, so script-supplied
+     * strings that are too short are rejected here. */
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "s", &arg, &arg_len) == FAILURE ||
+        arg_len < 2) {
+        RETURN_FALSE;
+    }
+    puzzle_init_compressed_cvec(context, &compressed_cvec);
+    puzzle_init_cvec(context, &cvec);
+    /* Borrow the PHP string as the compressed storage for the call. */
+    compressed_cvec.vec = arg;
+    compressed_cvec.sizeof_compressed_vec = (size_t) arg_len;
+    if (puzzle_uncompress_cvec(context, &compressed_cvec, &cvec) != 0) {
+        RETVAL_FALSE;
+    } else {
+        RETVAL_STRINGL(cvec.vec, cvec.sizeof_vec, 1);
+    }
+    puzzle_free_cvec(context, &cvec);
+    /* Detach the borrowed buffer so the free routine does not release
+     * memory that belongs to PHP. */
+    compressed_cvec.vec = NULL;
+    puzzle_free_compressed_cvec(context, &compressed_cvec);
+}
+/* }}} */
+
+/* {{{ proto double puzzle_vector_normalized_distance(string cvec1, string cvec2 [, bool fix_for_texts])
+ * Computes the distance between two signatures. Result is between 0.0 and 1.0.
+ * fix_for_texts defaults to TRUE when omitted. Returns FALSE when the
+ * arguments cannot be parsed or either signature is empty. */
+PHP_FUNCTION(puzzle_vector_normalized_distance)
+{
+    char *vec1 = NULL, *vec2 = NULL;
+    int vec1_len, vec2_len;
+    PuzzleContext *context;
+    PuzzleCvec cvec1, cvec2;
+    double d;
+    /* Default for the optional third argument. It is set before parsing:
+     * zend_parse_parameters() leaves optional outputs untouched when the
+     * caller omits them. The previous post-parse test appended TSRMLS_CC
+     * to ZEND_NUM_ARGS(), which turns into a comma expression in
+     * thread-safe builds. */
+    zend_bool fix_for_texts = (zend_bool) 1;
+
+    context = &LIBPUZZLE_G(global_context);
+    if (zend_parse_parameters
+        (ZEND_NUM_ARGS() TSRMLS_CC, "ss|b",
+         &vec1, &vec1_len, &vec2, &vec2_len, &fix_for_texts) == FAILURE ||
+        vec1_len <= 0 || vec2_len <= 0) {
+        RETURN_FALSE;
+    }
+    puzzle_init_cvec(context, &cvec1);
+    puzzle_init_cvec(context, &cvec2);
+    /* Borrow both PHP strings as vector storage for the call. */
+    cvec1.vec = vec1;
+    cvec1.sizeof_vec = (size_t) vec1_len;
+    cvec2.vec = vec2;
+    cvec2.sizeof_vec = (size_t) vec2_len;
+    d = puzzle_vector_normalized_distance(context, &cvec1, &cvec2,
+                                          (int) fix_for_texts);
+    /* Detach the borrowed buffers before the free routines run. */
+    cvec1.vec = cvec2.vec = NULL;
+    puzzle_free_cvec(context, &cvec1);
+    puzzle_free_cvec(context, &cvec2);
+    RETVAL_DOUBLE(d);
+}
+/* }}} */
+
+/* {{{ proto bool puzzle_set_max_width(int width)
+ * Sets the maximum picture width on the request context.
+ * Returns TRUE on success; FALSE when the argument cannot be parsed, is
+ * not in the range 1..INT_MAX, or the library rejects the value. */
+PHP_FUNCTION(puzzle_set_max_width)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    long requested;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "l", &requested) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (requested <= 0L || requested > INT_MAX) {
+        RETURN_FALSE;
+    }
+    if (puzzle_set_max_width(ctx, (unsigned int) requested) == 0) {
+        RETURN_TRUE;
+    }
+    RETURN_FALSE;
+}
+/* }}} */
+
+/* {{{ proto bool puzzle_set_max_height(int height)
+ * Sets the maximum picture height on the request context.
+ * Returns TRUE on success; FALSE when the argument cannot be parsed, is
+ * not in the range 1..INT_MAX, or the library rejects the value. */
+PHP_FUNCTION(puzzle_set_max_height)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    long requested;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "l", &requested) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (requested <= 0L || requested > INT_MAX) {
+        RETURN_FALSE;
+    }
+    if (puzzle_set_max_height(ctx, (unsigned int) requested) == 0) {
+        RETURN_TRUE;
+    }
+    RETURN_FALSE;
+}
+/* }}} */
+
+/* {{{ proto bool puzzle_set_lambdas(int lambdas)
+ * Sets the size of the computation grid on the request context.
+ * Returns TRUE on success; FALSE when the argument cannot be parsed, is
+ * not in the range 1..INT_MAX, or the library rejects the value. */
+PHP_FUNCTION(puzzle_set_lambdas)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    long requested;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "l", &requested) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (requested <= 0L || requested > INT_MAX) {
+        RETURN_FALSE;
+    }
+    if (puzzle_set_lambdas(ctx, (unsigned int) requested) == 0) {
+        RETURN_TRUE;
+    }
+    RETURN_FALSE;
+}
+/* }}} */
+
+/* {{{ proto bool puzzle_set_noise_cutoff(double cutoff)
+ * Sets the noise cutoff level on the request context.
+ * Returns TRUE on success; FALSE when the argument cannot be parsed or
+ * the library rejects the value. */
+PHP_FUNCTION(puzzle_set_noise_cutoff)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    double level;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "d", &level) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (puzzle_set_noise_cutoff(ctx, level) == 0) {
+        RETURN_TRUE;
+    }
+    RETURN_FALSE;
+}
+/* }}} */
+
+/* {{{ proto bool puzzle_set_p_ratio(double ratio)
+ * Sets the p_ratio on the request context.
+ * Returns TRUE on success; FALSE when the argument cannot be parsed or
+ * the library rejects the value. */
+PHP_FUNCTION(puzzle_set_p_ratio)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    double value;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "d", &value) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (puzzle_set_p_ratio(ctx, value) == 0) {
+        RETURN_TRUE;
+    }
+    RETURN_FALSE;
+}
+/* }}} */
+
+/* {{{ proto bool puzzle_set_contrast_barrier_for_cropping(double barrier)
+ * Sets the tolerance level for cropping on the request context.
+ * Returns TRUE on success; FALSE when the argument cannot be parsed or
+ * the library rejects the value. */
+PHP_FUNCTION(puzzle_set_contrast_barrier_for_cropping)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    double tolerance;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "d", &tolerance) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (puzzle_set_contrast_barrier_for_cropping(ctx, tolerance) == 0) {
+        RETURN_TRUE;
+    }
+    RETURN_FALSE;
+}
+/* }}} */
+
+/* {{{ proto bool puzzle_set_max_cropping_ratio(double ratio)
+ * Sets the maximum ratio between the cropped area and the whole picture.
+ * Returns TRUE on success; FALSE when the argument cannot be parsed or
+ * the library rejects the value. */
+PHP_FUNCTION(puzzle_set_max_cropping_ratio)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    double limit;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "d", &limit) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (puzzle_set_max_cropping_ratio(ctx, limit) == 0) {
+        RETURN_TRUE;
+    }
+    RETURN_FALSE;
+}
+/* }}} */
+
+/* {{{ proto bool puzzle_set_autocrop(bool autocrop)
+ * TRUE to enable autocropping, FALSE to disable it.
+ * Returns TRUE on success; FALSE when the argument cannot be parsed or
+ * the library rejects the value. */
+PHP_FUNCTION(puzzle_set_autocrop)
+{
+    PuzzleContext * const ctx = &LIBPUZZLE_G(global_context);
+    zend_bool enabled;
+
+    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC,
+                              "b", &enabled) == FAILURE) {
+        RETURN_FALSE;
+    }
+    if (puzzle_set_autocrop(ctx, (int) enabled) == 0) {
+        RETURN_TRUE;
+    }
+    RETURN_FALSE;
+}
+/* }}} */
+
+/*
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ * vim600: noet sw=4 ts=4 fdm=marker
+ * vim<600: noet sw=4 ts=4
+ */
diff --git a/deduper/libpuzzle/php/libpuzzle/libpuzzle.php b/deduper/libpuzzle/php/libpuzzle/libpuzzle.php
new file mode 100644
index 0000000..415273b
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/libpuzzle.php
@@ -0,0 +1,21 @@
+<?php
+// Smoke-test script: loads the libpuzzle extension if needed and lists the
+// functions it exports.
+$br = (php_sapi_name() == "cli")? "":"<br>";
+
+$module = 'libpuzzle';
+// Runtime loading is only attempted when dl() is available in this build.
+if (!extension_loaded($module) && function_exists('dl')) {
+    dl('libpuzzle.' . PHP_SHLIB_SUFFIX);
+}
+$functions = get_extension_funcs($module);
+echo "Functions available in the test extension:$br\n";
+// get_extension_funcs() yields false when the module is not loaded.
+if ($functions) {
+    foreach ($functions as $func) {
+        echo $func."$br\n";
+    }
+}
+echo "$br\n";
+// The skeleton's confirm_<module>_compiled() helper is not part of this
+// extension's function table, so it is only called when it really exists.
+$function = 'confirm_' . $module . '_compiled';
+if (!extension_loaded($module)) {
+    $str = "Module $module is not compiled into PHP";
+} elseif (function_exists($function)) {
+    $str = $function($module);
+} else {
+    $str = "Module $module is loaded";
+}
+echo "$str\n";
+?>
diff --git a/deduper/libpuzzle/php/libpuzzle/modules/Makefile.am b/deduper/libpuzzle/php/libpuzzle/modules/Makefile.am
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/modules/Makefile.am
diff --git a/deduper/libpuzzle/php/libpuzzle/php_libpuzzle.h b/deduper/libpuzzle/php/libpuzzle/php_libpuzzle.h
new file mode 100644
index 0000000..1fae819
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/php_libpuzzle.h
@@ -0,0 +1,66 @@
+#ifndef PHP_LIBPUZZLE_H
+#define PHP_LIBPUZZLE_H
+
+/* Module descriptor defined in libpuzzle.c, plus the pointer macro that
+ * refers to it. */
+extern zend_module_entry libpuzzle_module_entry;
+#define phpext_libpuzzle_ptr &libpuzzle_module_entry
+
+/* Export decoration: dllexport on PHP_WIN32 builds, empty elsewhere. */
+#ifdef PHP_WIN32
+#define PHP_LIBPUZZLE_API __declspec(dllexport)
+#else
+#define PHP_LIBPUZZLE_API
+#endif
+
+#ifdef ZTS
+#include "TSRM.h"
+#endif
+
+/* Module and request lifecycle hooks. */
+PHP_MINIT_FUNCTION(libpuzzle);
+PHP_MSHUTDOWN_FUNCTION(libpuzzle);
+PHP_RINIT_FUNCTION(libpuzzle);
+PHP_RSHUTDOWN_FUNCTION(libpuzzle);
+PHP_MINFO_FUNCTION(libpuzzle);
+
+/* Context setters exposed to PHP. */
+PHP_FUNCTION(puzzle_set_max_width);
+PHP_FUNCTION(puzzle_set_max_height);
+PHP_FUNCTION(puzzle_set_lambdas);
+PHP_FUNCTION(puzzle_set_noise_cutoff);
+PHP_FUNCTION(puzzle_set_p_ratio);
+PHP_FUNCTION(puzzle_set_contrast_barrier_for_cropping);
+PHP_FUNCTION(puzzle_set_max_cropping_ratio);
+PHP_FUNCTION(puzzle_set_autocrop);
+
+/* Signature (cvec) operations exposed to PHP. */
+PHP_FUNCTION(puzzle_fill_cvec_from_file);
+PHP_FUNCTION(puzzle_compress_cvec);
+PHP_FUNCTION(puzzle_uncompress_cvec);
+PHP_FUNCTION(puzzle_vector_normalized_distance);
+
+/* Module globals: a single PuzzleContext used by every exported function. */
+ZEND_BEGIN_MODULE_GLOBALS(libpuzzle)
+ PuzzleContext global_context;
+ZEND_END_MODULE_GLOBALS(libpuzzle)
+
+/* In every utility function you add that needs to use variables
+ in php_libpuzzle_globals, call TSRMLS_FETCH(); after declaring other
+ variables used by that function, or better yet, pass in TSRMLS_CC
+ after the last function argument and declare your utility function
+ with TSRMLS_DC after the last declared argument. Always refer to
+ the globals in your function as LIBPUZZLE_G(variable). You are
+ encouraged to rename these macros something shorter, see
+ examples in any other php module directory.
+*/
+
+/* Accessor for a field of the module globals: goes through TSRMG in ZTS
+ * builds and reads the plain libpuzzle_globals struct otherwise. */
+#ifdef ZTS
+#define LIBPUZZLE_G(v) TSRMG(libpuzzle_globals_id, zend_libpuzzle_globals *, v)
+#else
+#define LIBPUZZLE_G(v) (libpuzzle_globals.v)
+#endif
+
+#endif /* PHP_LIBPUZZLE_H */
+
+/*
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ * vim600: noet sw=4 ts=4 fdm=marker
+ * vim<600: noet sw=4 ts=4
+ */
diff --git a/deduper/libpuzzle/php/libpuzzle/tests/001.phpt b/deduper/libpuzzle/php/libpuzzle/tests/001.phpt
new file mode 100644
index 0000000..5a5f5b5
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/tests/001.phpt
@@ -0,0 +1,10 @@
+--TEST--
+Check for libpuzzle presence
+--SKIPIF--
+<?php if (!extension_loaded("libpuzzle")) print "skip"; ?>
+--FILE--
+<?php
+// Runs only when the SKIPIF section above did not print "skip", i.e. the
+// libpuzzle extension is loaded; the fixed message is what EXPECT matches.
+echo "libpuzzle extension is available";
+?>
+--EXPECT--
+libpuzzle extension is available
diff --git a/deduper/libpuzzle/php/libpuzzle/tests/002.phpt b/deduper/libpuzzle/php/libpuzzle/tests/002.phpt
new file mode 100644
index 0000000..d675145
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/tests/002.phpt
@@ -0,0 +1,15 @@
+--TEST--
+Check for distance between similar images
+--SKIPIF--
+<?php if (!extension_loaded("libpuzzle")) print "skip"; ?>
+--FILE--
+<?php
+
+// Build the signatures of the two bundled sample pictures and compare them.
+$cvec1 = puzzle_fill_cvec_from_file(dirname(__FILE__) . '/pics/pic-a-0.jpg');
+$cvec2 = puzzle_fill_cvec_from_file(dirname(__FILE__) . '/pics/pic-a-1.jpg');
+$d = puzzle_vector_normalized_distance($cvec1, $cvec2);
+// exit() with an integer only sets the process exit status and prints
+// nothing, so the result is echoed for the EXPECT section to match.
+echo (int) ($d < PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD);
+
+?>
+--EXPECT--
+1
diff --git a/deduper/libpuzzle/php/libpuzzle/tests/003.phpt b/deduper/libpuzzle/php/libpuzzle/tests/003.phpt
new file mode 100644
index 0000000..ba7d5aa
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/tests/003.phpt
@@ -0,0 +1,24 @@
+--TEST--
+Check the puzzle_set(3) interface
+--SKIPIF--
+<?php if (!extension_loaded("libpuzzle")) print "skip"; ?>
+--FILE--
+<?php
+
+// Build the signatures first, then exercise every context setter.
+$cvec1 = puzzle_fill_cvec_from_file(dirname(__FILE__) . '/pics/pic-a-0.jpg');
+$cvec2 = puzzle_fill_cvec_from_file(dirname(__FILE__) . '/pics/pic-a-1.jpg');
+puzzle_set_max_width(1500);
+puzzle_set_max_height(1500);
+puzzle_set_lambdas(11);
+puzzle_set_noise_cutoff(1.0);
+puzzle_set_p_ratio(2.0);
+puzzle_set_contrast_barrier_for_cropping(0.1);
+puzzle_set_max_cropping_ratio(0.1);
+puzzle_set_autocrop(FALSE);
+
+$d = puzzle_vector_normalized_distance($cvec1, $cvec2);
+// exit() with an integer only sets the process exit status and prints
+// nothing, so the result is echoed for the EXPECT section to match.
+echo (int) ($d < PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD);
+
+?>
+--EXPECT--
+1
diff --git a/deduper/libpuzzle/php/libpuzzle/tests/Makefile.am b/deduper/libpuzzle/php/libpuzzle/tests/Makefile.am
new file mode 100644
index 0000000..14ded39
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/tests/Makefile.am
@@ -0,0 +1,7 @@
+EXTRA_DIST = \
+ 001.phpt \
+ 002.phpt \
+ 003.phpt
+
+SUBDIRS = \
+ pics
diff --git a/deduper/libpuzzle/php/libpuzzle/tests/pics/Makefile.am b/deduper/libpuzzle/php/libpuzzle/tests/pics/Makefile.am
new file mode 100644
index 0000000..0aacd9a
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/tests/pics/Makefile.am
@@ -0,0 +1,3 @@
+EXTRA_DIST = \
+ pic-a-0.jpg \
+ pic-a-1.jpg
diff --git a/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-0.jpg b/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-0.jpg
new file mode 100644
index 0000000..3dd4a3b
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-0.jpg
Binary files differ
diff --git a/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-1.jpg b/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-1.jpg
new file mode 100644
index 0000000..95f0e77
--- /dev/null
+++ b/deduper/libpuzzle/php/libpuzzle/tests/pics/pic-a-1.jpg
Binary files differ
diff --git a/deduper/libpuzzle/src/CMakeLists.txt b/deduper/libpuzzle/src/CMakeLists.txt
new file mode 100644
index 0000000..634ef38
--- /dev/null
+++ b/deduper/libpuzzle/src/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Builds libpuzzle as a static C library named "puzzle".
+project(puzzle C)
+
+# Locate the GD library through pkg-config; configuration fails if it is
+# missing (REQUIRED).
+include(FindPkgConfig)
+pkg_search_module(gdlib REQUIRED gdlib)
+
+# Library headers and sources.
+add_library(puzzle STATIC
+ globals.h
+ puzzle_common.h
+ puzzle_p.h
+ puzzle.h
+ compress.c
+ cvec.c
+ dvec.c
+ puzzle.c
+ tunables.c
+ vector_ops.c
+)
+# GD headers are needed only while compiling the library itself (PRIVATE).
+target_include_directories(puzzle
+ PRIVATE
+ ${gdlib_INCLUDE_DIRS}
+)
diff --git a/deduper/libpuzzle/src/Makefile.am b/deduper/libpuzzle/src/Makefile.am
new file mode 100644
index 0000000..3016925
--- /dev/null
+++ b/deduper/libpuzzle/src/Makefile.am
@@ -0,0 +1,72 @@
+lib_LTLIBRARIES = \
+ libpuzzle.la
+
+libpuzzle_la_LDFLAGS = -version-info 1:0
+
+libpuzzle_la_SOURCES = \
+ puzzle.c \
+ tunables.c \
+ dvec.c \
+ cvec.c \
+ compress.c \
+ vector_ops.c \
+ puzzle_common.h \
+ puzzle_p.h \
+ globals.h \
+ puzzle.h
+
+include_HEADERS = \
+ puzzle.h
+
+noinst_HEADERS = \
+ puzzle_common.h \
+ puzzle_p.h \
+ globals.h
+
+bin_PROGRAMS = \
+ puzzle-diff
+
+puzzle_diff_SOURCES = \
+ puzzle-diff.c \
+ puzzle_common.h \
+ puzzle.h
+
+puzzle_diff_LDADD = \
+ libpuzzle.la
+
+TESTS = \
+ regress_1 \
+ regress_2 \
+ regress_3
+
+check_PROGRAMS = \
+ regress_1 \
+ regress_2 \
+ regress_3
+
+regress_1_SOURCES = \
+ regress_1.c \
+ puzzle_common.h \
+ puzzle.h
+
+regress_2_SOURCES = \
+ regress_2.c \
+ puzzle_common.h \
+ puzzle.h
+
+regress_3_SOURCES = \
+ regress_3.c \
+ puzzle_common.h \
+ puzzle.h
+
+regress_1_LDADD = \
+ libpuzzle.la
+
+regress_2_LDADD = \
+ libpuzzle.la
+
+regress_3_LDADD = \
+ libpuzzle.la
+
+SUBDIRS = \
+ pics
diff --git a/deduper/libpuzzle/src/compress.c b/deduper/libpuzzle/src/compress.c
new file mode 100644
index 0000000..e71da95
--- /dev/null
+++ b/deduper/libpuzzle/src/compress.c
@@ -0,0 +1,125 @@
+#include "puzzle_common.h"
+#include "puzzle_p.h"
+#include "puzzle.h"
+#include "globals.h"
+
+/* Puts a compressed vector into its empty state: no buffer, zero length.
+ * The context argument is accepted but not used. */
+void puzzle_init_compressed_cvec(PuzzleContext * const context,
+                                 PuzzleCompressedCvec * const compressed_cvec)
+{
+    (void) context;
+    compressed_cvec->vec = NULL;
+    compressed_cvec->sizeof_compressed_vec = (size_t) 0U;
+}
+
+/* Releases the buffer held by a compressed vector and clears the pointer
+ * so that a repeated call is harmless. The length field is left as is.
+ * The context argument is accepted but not used. */
+void puzzle_free_compressed_cvec(PuzzleContext * const context,
+                                 PuzzleCompressedCvec * const compressed_cvec)
+{
+    (void) context;
+
+    free(compressed_cvec->vec);
+    compressed_cvec->vec = NULL;
+}
+
+/* Packs a signature into one byte per group of three values.
+ *
+ * Each value is shifted by +2 (PC_NM) and three of them are combined as
+ * base-5 digits into a single byte. A final group of one or two values
+ * is stored in one last byte, and its size is recorded in the top bit of
+ * output byte 0 (one value left) or output byte 1 (two values left).
+ *
+ * Allocates compressed_cvec->vec; returns 0 on success and -1 when the
+ * allocation fails. Inconsistencies are reported through puzzle_err_bug().
+ * The context argument is accepted but not used. */
+int puzzle_compress_cvec(PuzzleContext * const context,
+                         PuzzleCompressedCvec * const compressed_cvec,
+                         const PuzzleCvec * const cvec)
+{
+#define PC_NM(X) ((unsigned char) ((X) + 2))
+    size_t left;
+    const signed char *src;
+    unsigned char *dst;
+
+    (void) context;
+    /* Output length: input length divided by three, rounded up. */
+    compressed_cvec->sizeof_compressed_vec =
+        (cvec->sizeof_vec + (size_t) 2U) / (size_t) 3U;
+    compressed_cvec->vec = calloc(compressed_cvec->sizeof_compressed_vec,
+                                  sizeof *compressed_cvec->vec);
+    if (compressed_cvec->vec == NULL) {
+        return -1;
+    }
+    src = cvec->vec;
+    dst = compressed_cvec->vec;
+    /* Full groups of three values. */
+    for (left = cvec->sizeof_vec; left >= (size_t) 3U; left -= 3U) {
+        *dst++ = PC_NM(src[0]) + PC_NM(src[1]) * 5U +
+            PC_NM(src[2]) * (5U * 5U);
+        src += 3U;
+    }
+    /* Trailing partial group, flagged in the top bit of byte 0 or 1. */
+    switch (left) {
+    case 1U:
+        *dst++ = PC_NM(src[0]);
+        compressed_cvec->vec[0] |= 128U;
+        break;
+    case 2U:
+        *dst++ = PC_NM(src[0]) + PC_NM(src[1]) * 5U;
+        if (compressed_cvec->sizeof_compressed_vec < (size_t) 2U) {
+            puzzle_err_bug(__FILE__, __LINE__);
+        }
+        compressed_cvec->vec[1] |= 128U;
+        break;
+    default:
+        break;
+    }
+    /* The write cursor must land exactly at the end of the buffer. */
+    if ((size_t) (dst - compressed_cvec->vec) !=
+        compressed_cvec->sizeof_compressed_vec) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    return 0;
+}
+
+/* Expands a compressed signature back into one value per element.
+ *
+ * Every full input byte holds three base-5 digits, each shifted back by
+ * -2 (PC_NP). The top bits of input bytes 0 and 1 say whether the last
+ * byte is a partial group and how many values it carries (1 or 2); these
+ * flag bits are masked off (PC_FL) before decoding.
+ *
+ * cvec->vec must be NULL on entry and is allocated here. Returns 0 on
+ * success and -1 when the allocation fails. Malformed input (fewer than
+ * two bytes, both flag bits set, oversized length) is reported through
+ * puzzle_err_bug(). The context argument is accepted but not used. */
+int puzzle_uncompress_cvec(PuzzleContext * const context,
+                           const PuzzleCompressedCvec * const compressed_cvec,
+                           PuzzleCvec * const cvec)
+{
+#define PC_FL(X) ((X) & 127U)
+#define PC_NP(X) ((signed char) (X) - 2)
+
+    size_t remaining;
+    size_t trailing_bytes;
+    unsigned char trailing_bits;
+    const unsigned char *cptr = compressed_cvec->vec;
+    signed char *ptr;
+    unsigned char c;
+
+    (void) context;
+    if (cvec->vec != NULL) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    if ((remaining = compressed_cvec->sizeof_compressed_vec) < (size_t) 2U) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    /* Guard the multiplication below before it is performed. */
+    if (compressed_cvec->sizeof_compressed_vec >
+        SIZE_MAX / (size_t) 3U - (size_t) 2U) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    trailing_bits = ((cptr[0] & 128U) >> 7) | ((cptr[1] & 128U) >> 6);
+    if (trailing_bits > 2U) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    /* A partial group always occupies exactly ONE input byte, whether it
+     * carries one or two values. Subtracting trailing_bits bytes here
+     * (as opposed to one byte) made the output buffer three elements too
+     * small in the two-value case, and the decoding loop then wrote past
+     * its end. */
+    trailing_bytes = (trailing_bits != 0U) ? (size_t) 1U : (size_t) 0U;
+    cvec->sizeof_vec = (size_t) 3U *
+        (compressed_cvec->sizeof_compressed_vec - trailing_bytes) +
+        trailing_bits;
+    if ((cvec->vec = calloc(cvec->sizeof_vec, sizeof *cvec->vec)) == NULL) {
+        return -1;
+    }
+    if (trailing_bits != 0U) {
+        if (remaining <= (size_t) 0U) {
+            puzzle_err_bug(__FILE__, __LINE__);
+        }
+        /* The last byte is decoded separately after the loop. */
+        remaining--;
+    }
+    ptr = cvec->vec;
+    /* Full bytes: three base-5 digits each. */
+    while (remaining > (size_t) 0U) {
+        c = PC_FL(*cptr++);
+        *ptr++ = PC_NP(c % 5U);
+        c /= 5U;
+        *ptr++ = PC_NP(c % 5U);
+        c /= 5U;
+        *ptr++ = PC_NP(c % 5U);
+        remaining--;
+    }
+    /* Partial last byte: one or two digits. */
+    if (trailing_bits == 1U) {
+        *ptr++ = PC_NP(PC_FL(*cptr) % 5U);
+    } else if (trailing_bits == 2U) {
+        c = PC_FL(*cptr);
+        *ptr++ = PC_NP(c % 5U);
+        *ptr++ = PC_NP(c / 5U % 5U);
+    }
+    /* The write cursor must land exactly at the end of the buffer. */
+    if ((size_t) (ptr - cvec->vec) != cvec->sizeof_vec) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    return 0;
+}
diff --git a/deduper/libpuzzle/src/cvec.c b/deduper/libpuzzle/src/cvec.c
new file mode 100644
index 0000000..482b445
--- /dev/null
+++ b/deduper/libpuzzle/src/cvec.c
@@ -0,0 +1,202 @@
+#include "puzzle_common.h"
+#include "puzzle_p.h"
+#include "puzzle.h"
+#include "globals.h"
+
+/* qsort() comparator ordering doubles ascending. Returns -1, 0 or 1. */
+static int puzzle_median_cmp(const void * const a_, const void * const b_)
+{
+    const double lhs = * (const double *) a_;
+    const double rhs = * (const double *) b_;
+
+    if (lhs < rhs) {
+        return -1;
+    }
+    return (lhs > rhs) ? 1 : 0;
+}
+
+/* Sorts vec[0..size-1] in place and returns the cutoff value derived from
+ * it: the mean of the elements at index size/2 and size/2 + 1, falling
+ * back to the element at size/2 when that mean does not lie between the
+ * two. Returns 0.0 for an empty vector.
+ *
+ * The upper index is clamped to the last element. Without the clamp a
+ * two-element vector was averaged with vec[2], one element past the
+ * sorted range. Results for sizes 1 and >= 3 are unaffected. With the
+ * clamp the upper index can never be below the lower one, so no separate
+ * consistency check is needed. */
+static double puzzle_median(double * const vec, size_t size)
+{
+    size_t n;
+    size_t o;
+    double avg;
+
+    if (size <= (size_t) 0U) {
+        return 0.0;
+    }
+    qsort((void *) vec, size, sizeof *vec, puzzle_median_cmp);
+    n = size / (size_t) 2U;
+    o = n + (size_t) 1U;
+    if (o >= size) {
+        o = size - (size_t) 1U;
+    }
+    avg = (vec[n] + vec[o]) / 2.0;
+    if (avg < vec[n] || avg > vec[o]) {
+        avg = vec[n];
+    }
+    return avg;
+}
+
+/* Quantizes a vector of doubles into a signature with values in -2..+2.
+ *
+ * Values within [-noise_cutoff, +noise_cutoff] become 0. The remaining
+ * values are split into "darks" and "lights"; the cutoff of each group
+ * (puzzle_median) then separates -2 from -1 and +2 from +1.
+ *
+ * Allocates cvec->vec with dvec->sizeof_compressed_vec elements. Returns
+ * 0 on success and -1 when an allocation fails (cvec->vec may then be
+ * left allocated for the caller to free). A zero-length input and other
+ * inconsistencies are reported through puzzle_err_bug(). */
+int puzzle_fill_cvec_from_dvec(PuzzleContext * const context,
+                               PuzzleCvec * const cvec,
+                               const PuzzleDvec * const dvec)
+{
+    double *lights = NULL, *darks = NULL;
+    size_t n_lights = (size_t) 0U, n_darks = (size_t) 0U;
+    size_t cap_lights, cap_darks;
+    double light_threshold, dark_threshold;
+    const double *in;
+    signed char *out_ptr;
+    size_t left;
+    double value;
+    int err = 0;
+
+    cvec->sizeof_vec = dvec->sizeof_compressed_vec;
+    if (cvec->sizeof_vec <= (size_t) 0U) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    cvec->vec = calloc(cvec->sizeof_vec, sizeof *cvec->vec);
+    if (cvec->vec == NULL) {
+        return -1;
+    }
+    cap_lights = cap_darks = cvec->sizeof_vec;
+    if ((lights = calloc(cap_lights, sizeof *lights)) == NULL ||
+        (darks = calloc(cap_darks, sizeof *darks)) == NULL) {
+        err = -1;
+        goto out;
+    }
+    /* Pass 1: collect the values outside the noise band. */
+    in = dvec->vec;
+    left = cvec->sizeof_vec;
+    do {
+        value = *in++;
+        if (!(value >= - context->puzzle_noise_cutoff &&
+              value <= context->puzzle_noise_cutoff)) {
+            if (value < context->puzzle_noise_cutoff) {
+                darks[n_darks++] = value;
+                if (n_darks > cap_darks) {
+                    puzzle_err_bug(__FILE__, __LINE__);
+                }
+            } else if (value > context->puzzle_noise_cutoff) {
+                lights[n_lights++] = value;
+                if (n_lights > cap_lights) {
+                    puzzle_err_bug(__FILE__, __LINE__);
+                }
+            }
+        }
+    } while (--left != (size_t) 0U);
+    light_threshold = puzzle_median(lights, n_lights);
+    dark_threshold = puzzle_median(darks, n_darks);
+    /* Pass 2: map every value onto one of the five levels. */
+    in = dvec->vec;
+    out_ptr = cvec->vec;
+    left = cvec->sizeof_vec;
+    do {
+        value = *in++;
+        if (value >= - context->puzzle_noise_cutoff &&
+            value <= context->puzzle_noise_cutoff) {
+            *out_ptr++ = 0;
+        } else if (value < 0.0) {
+            if (value < dark_threshold) {
+                *out_ptr++ = -2;
+            } else {
+                *out_ptr++ = -1;
+            }
+        } else {
+            if (value > light_threshold) {
+                *out_ptr++ = +2;
+            } else {
+                *out_ptr++ = +1;
+            }
+        }
+    } while (--left != (size_t) 0U);
+    if ((size_t) (out_ptr - cvec->vec) != cvec->sizeof_vec) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+ out:
+    /* Scratch buffers are released on both the success and error paths. */
+    free(lights);
+    free(darks);
+
+    return err;
+}
+
+/* Puts a signature vector into its empty state: no buffer, zero length.
+ * The context argument is accepted but not used. */
+void puzzle_init_cvec(PuzzleContext * const context, PuzzleCvec * const cvec)
+{
+    (void) context;
+    cvec->vec = NULL;
+    cvec->sizeof_vec = (size_t) 0U;
+}
+
+/* Releases the buffer held by a signature vector and clears the pointer
+ * so that a repeated call is harmless. The length field is left as is.
+ * The context argument is accepted but not used. */
+void puzzle_free_cvec(PuzzleContext * const context, PuzzleCvec * const cvec)
+{
+    (void) context;
+
+    free(cvec->vec);
+    cvec->vec = NULL;
+}
+
+/* Prints every element of a signature to standard output, one decimal
+ * value per line. An empty vector is reported through puzzle_err_bug().
+ * Always returns 0. The context argument is accepted but not used. */
+int puzzle_dump_cvec(PuzzleContext * const context,
+                     const PuzzleCvec * const cvec)
+{
+    const signed char *cursor = cvec->vec;
+    size_t left = cvec->sizeof_vec;
+
+    (void) context;
+    if (left <= (size_t) 0U) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    do {
+        printf("%d\n", *cursor);
+        cursor++;
+        left--;
+    } while (left != (size_t) 0U);
+
+    return 0;
+}
+
+/* Computes a checksum of a signature and stores it in *sum.
+ *
+ * Starting from 5381, each element updates the hash as
+ * h = (h + (h << 5)) ^ element. An empty vector yields the start value:
+ * the loop is pre-tested so that a zero length no longer reads an element
+ * and then wraps the counter around.
+ * Always returns 0. The context argument is accepted but not used. */
+int puzzle_cvec_cksum(PuzzleContext * const context,
+                      const PuzzleCvec * const cvec, unsigned int * const sum)
+{
+    size_t s;
+    const signed char *vecptr = cvec->vec;
+    unsigned int h = 5381U;
+
+    (void) context;
+    for (s = cvec->sizeof_vec; s != (size_t) 0U; s--) {
+        h += h << 5;
+        h ^= (unsigned int) *vecptr++;
+    }
+    *sum = h;
+
+    return 0;
+}
+
+/* Builds a signature from an image file by first computing the
+ * intermediate PuzzleDvec for the file and then quantizing it.
+ * Returns 0 on success, otherwise the non-zero status of the step that
+ * failed. The intermediate vector is always released. */
+int puzzle_fill_cvec_from_file(PuzzleContext * const context,
+                               PuzzleCvec * const cvec,
+                               const char * const file)
+{
+    PuzzleDvec intermediate;
+    int status;
+
+    puzzle_init_dvec(context, &intermediate);
+    status = puzzle_fill_dvec_from_file(context, &intermediate, file);
+    if (status == 0) {
+        status = puzzle_fill_cvec_from_dvec(context, cvec, &intermediate);
+    }
+    puzzle_free_dvec(context, &intermediate);
+
+    return status;
+}
+
+/* Builds a signature from an in-memory image of `size` bytes at `mem` by
+ * first computing the intermediate PuzzleDvec and then quantizing it.
+ * Returns 0 on success, otherwise the non-zero status of the step that
+ * failed. The intermediate vector is always released. */
+int puzzle_fill_cvec_from_mem(PuzzleContext * const context,
+                              PuzzleCvec * const cvec,
+                              const void * const mem,
+                              const size_t size)
+{
+    PuzzleDvec intermediate;
+    int status;
+
+    puzzle_init_dvec(context, &intermediate);
+    status = puzzle_fill_dvec_from_mem(context, &intermediate, mem, size);
+    if (status == 0) {
+        status = puzzle_fill_cvec_from_dvec(context, cvec, &intermediate);
+    }
+    puzzle_free_dvec(context, &intermediate);
+
+    return status;
+}
diff --git a/deduper/libpuzzle/src/dvec.c b/deduper/libpuzzle/src/dvec.c
new file mode 100644
index 0000000..f5d21f9
--- /dev/null
+++ b/deduper/libpuzzle/src/dvec.c
@@ -0,0 +1,663 @@
+#include "puzzle_common.h"
+#include "puzzle_p.h"
+#include "puzzle.h"
+#include "globals.h"
+
+/* Put a PuzzleView into its empty state: zero dimensions and no pixel map. */
+static void puzzle_init_view(PuzzleView * const view)
+{
+    view->map = NULL;
+    view->sizeof_map = (size_t) 0U;
+    view->height = 0U;
+    view->width = 0U;
+}
+
+/* Release the pixel map of a view and clear the pointer (dimensions are kept). */
+static void puzzle_free_view(PuzzleView * const view)
+{
+    unsigned char * const pixels = view->map;
+
+    view->map = NULL;
+    free(pixels);
+}
+
+/* Put a PuzzleAvgLvls into its empty state: no grid and no storage. */
+static void puzzle_init_avglvls(PuzzleAvgLvls * const avglvls)
+{
+    avglvls->lvls = NULL;
+    avglvls->sizeof_lvls = (size_t) 0U;
+    avglvls->lambdas = 0U;
+}
+
+/* Release the level array and clear the pointer (lambdas/size are kept). */
+static void puzzle_free_avglvls(PuzzleAvgLvls * const avglvls)
+{
+    double * const levels = avglvls->lvls;
+
+    avglvls->lvls = NULL;
+    free(levels);
+}
+
+/*
+ * Put a PuzzleDvec into its empty state: both element counts are zero and
+ * no storage is attached.  The context is accepted for API symmetry only.
+ */
+void puzzle_init_dvec(PuzzleContext * const context, PuzzleDvec * const dvec)
+{
+    (void) context;
+    dvec->vec = NULL;
+    dvec->sizeof_compressed_vec = (size_t) 0U;
+    dvec->sizeof_vec = (size_t) 0U;
+}
+
+/*
+ * Release the storage attached to a PuzzleDvec and clear the pointer.
+ * The element counts are left untouched.
+ */
+void puzzle_free_dvec(PuzzleContext * const context, PuzzleDvec * const dvec)
+{
+    double * const storage = dvec->vec;
+
+    (void) context;
+    dvec->vec = NULL;
+    free(storage);
+}
+
+#define MAX_SIGNATURE_LENGTH 8U
+
+/*
+ * Identify an image format from its leading bytes.
+ *
+ * header must point to at least MAX_SIGNATURE_LENGTH readable bytes: each
+ * known signature is compared against the start of the buffer.  Returns the
+ * type code of the first signature that matches, or
+ * PUZZLE_IMAGE_TYPE_UNKNOWN when none does.  A table entry longer than
+ * MAX_SIGNATURE_LENGTH is an internal error.
+ */
+static PuzzleImageTypeCode puzzle_get_image_type_from_header(const unsigned char * const header)
+{
+    /* Terminated by an entry whose signature pointer is NULL. */
+    static const PuzzleImageType known_types[] = {
+        { (size_t) 4U, (const unsigned char *)
+          "GIF8", PUZZLE_IMAGE_TYPE_GIF },
+        { (size_t) 3U, (const unsigned char *)
+          "\xff\xd8\xff", PUZZLE_IMAGE_TYPE_JPEG },
+        { (size_t) 8U, (const unsigned char *)
+          "\x89PNG\r\n\x1a\n", PUZZLE_IMAGE_TYPE_PNG },
+        { (size_t) 0U, NULL, PUZZLE_IMAGE_TYPE_UNKNOWN }
+    };
+    const PuzzleImageType *candidate;
+
+    for (candidate = known_types; candidate->signature != NULL; candidate++) {
+        if (candidate->sizeof_signature > MAX_SIGNATURE_LENGTH) {
+            puzzle_err_bug(__FILE__, __LINE__);
+        }
+        if (memcmp(header, candidate->signature,
+                   candidate->sizeof_signature) == 0) {
+            return candidate->image_type_code;
+        }
+    }
+    return PUZZLE_IMAGE_TYPE_UNKNOWN;
+}
+
+/*
+ * Identify the image format of an open stream by reading its first
+ * MAX_SIGNATURE_LENGTH bytes.
+ *
+ * The current stream position is saved first and restored before returning.
+ * Returns PUZZLE_IMAGE_TYPE_ERROR when the position cannot be saved or the
+ * full header cannot be read; failing to restore the position is treated as
+ * an internal error.
+ */
+static PuzzleImageTypeCode puzzle_get_image_type_from_fp(FILE * const fp)
+{
+    unsigned char header[MAX_SIGNATURE_LENGTH];
+    PuzzleImageTypeCode detected = PUZZLE_IMAGE_TYPE_ERROR;
+    fpos_t saved_pos;
+
+    if (fgetpos(fp, &saved_pos) != 0) {
+        return PUZZLE_IMAGE_TYPE_ERROR;
+    }
+    rewind(fp);
+    if (fread(header, (size_t) 1U, sizeof header, fp) == sizeof header) {
+        detected = puzzle_get_image_type_from_header(header);
+    }
+    if (fsetpos(fp, &saved_pos) != 0) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    return detected;
+}
+
+static int puzzle_autocrop_axis(PuzzleContext * const context, /* Compute crop bounds along one axis from per-line contrast. Returns 0 on success, 1 when the view is too small to crop (bounds left at the full extent), -1 on allocation failure. */
+ PuzzleView * const view,
+ unsigned int * const crop0, /* out: first index kept along the axis */
+ unsigned int * const crop1, /* out: last index kept along the axis (inclusive) */
+ const unsigned int axisn, /* number of lines along the axis being cropped */
+ const unsigned int axiso, /* number of samples in each line */
+ const int omaptrinc, const int nmaptrinc) /* map pointer step after each sample / extra step after each line */
+{
+ double *chunk_contrasts;
+ size_t sizeof_chunk_contrasts;
+ double chunk_contrast = 0.0, total_contrast = 0.0, barrier_contrast;
+ unsigned char level = 0U;
+ unsigned char previous_level = 0U;
+ unsigned int chunk_n, chunk_o;
+ unsigned int chunk_n1, chunk_o1;
+ unsigned int max_crop;
+ const unsigned char *maptr;
+
+ chunk_n1 = axisn - 1U;
+ chunk_o1 = axiso - 1U;
+ *crop0 = 0U; /* default bounds: keep everything */
+ *crop1 = chunk_n1;
+ if (axisn < (unsigned int) PUZZLE_MIN_SIZE_FOR_CROPPING ||
+ axiso < (unsigned int) PUZZLE_MIN_SIZE_FOR_CROPPING) {
+ return 1; /* either dimension too small: no cropping */
+ }
+ sizeof_chunk_contrasts = chunk_n1 + 1U; /* one contrast total per line */
+ if ((chunk_contrasts = calloc(sizeof_chunk_contrasts,
+ sizeof *chunk_contrasts)) == NULL) {
+ return -1;
+ }
+ maptr = view->map;
+ if (axisn >= INT_MAX || axiso >= INT_MAX) {
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ if (INT_MAX / axisn < axiso) { /* axisn * axiso must fit in an int */
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ chunk_n = chunk_n1; /* results are stored from index chunk_n1 down to 0 while maptr walks forward from the start of the map */
+ do {
+ chunk_contrast = 0.0;
+ chunk_o = chunk_o1;
+ previous_level = *maptr;
+ do { /* sum of absolute differences between consecutive samples of this line */
+ level = *maptr;
+ if (previous_level > level) {
+ chunk_contrast += (double) (previous_level - level);
+ } else {
+ chunk_contrast += (double) (level - previous_level);
+ }
+ previous_level = level;
+ maptr += omaptrinc;
+ } while (chunk_o-- != 0U);
+ chunk_contrasts[chunk_n] = chunk_contrast;
+ total_contrast += chunk_contrast;
+ maptr += nmaptrinc; /* reposition at the first sample of the next line */
+ } while (chunk_n-- != 0U);
+ barrier_contrast =
+ total_contrast * context->puzzle_contrast_barrier_for_cropping; /* share of the total contrast that may be cropped away on each side */
+ total_contrast = 0.0;
+ *crop0 = 0U;
+ do { /* advance from the low end until the accumulated contrast reaches the barrier */
+ total_contrast += chunk_contrasts[*crop0];
+ if (total_contrast >= barrier_contrast) {
+ break;
+ }
+ } while ((*crop0)++ < chunk_n1);
+ total_contrast = 0.0;
+ *crop1 = chunk_n1;
+ do { /* same scan from the high end */
+ total_contrast += chunk_contrasts[*crop1];
+ if (total_contrast >= barrier_contrast) {
+ break;
+ }
+ } while ((*crop1)-- > 0U);
+ free(chunk_contrasts);
+ if (*crop0 > chunk_n1 || *crop1 > chunk_n1) { /* a scan ran off the array without reaching the barrier (crop1 wraps around in that case) */
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ max_crop = (unsigned int)
+ round((double) chunk_n1 * context->puzzle_max_cropping_ratio); /* largest number of lines that may be removed on one side */
+ if (max_crop > chunk_n1) {
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ *crop0 = MIN(*crop0, max_crop); /* clamp both bounds to the allowed cropping amount */
+ *crop1 = MAX(*crop1, chunk_n1 - max_crop);
+
+ return 0;
+}
+
+static int puzzle_autocrop_view(PuzzleContext * context, /* Crop the view in place to the bounds found by puzzle_autocrop_axis() on both axes. Returns 0 on success, -1 if either axis computation fails. */
+ PuzzleView * const view)
+{
+ unsigned int cropx0, cropx1;
+ unsigned int cropy0, cropy1;
+ unsigned int x, y;
+ unsigned char *maptr;
+
+ if (puzzle_autocrop_axis(context, view, &cropx0, &cropx1,
+ view->width, view->height,
+ (int) view->width, /* x axis: consecutive samples are one row (width bytes) apart */
+ 1 - (int) (view->width * view->height)) < 0 || /* after a line: net effect is one byte past the line's starting point */
+ puzzle_autocrop_axis(context, view, &cropy0, &cropy1,
+ view->height, view->width,
+ 1, 0) < 0) { /* y axis: samples are adjacent bytes, no extra step between lines */
+ return -1;
+ }
+ if (cropx0 > cropx1 || cropy0 > cropy1) {
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ maptr = view->map; /* destination: kept pixels are packed at the front of the same buffer */
+ y = cropy0;
+ do {
+ x = cropx0;
+ do {
+ *maptr++ = PUZZLE_VIEW_PIXEL(view, x, y); /* view->width still holds the old width here, so the read uses the original layout */
+ } while (x++ != cropx1);
+ } while (y++ != cropy1);
+ view->width = cropx1 - cropx0 + 1U; /* bounds are inclusive */
+ view->height = cropy1 - cropy0 + 1U;
+ view->sizeof_map = (size_t) view->width * (size_t) view->height; /* the buffer itself is not reallocated */
+ if (view->width <= 0U || view->height <= 0U ||
+ SIZE_MAX / view->width < view->height) {
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ return 0;
+}
+
+static int puzzle_getview_from_gdimage(PuzzleContext * const context, /* Fill view with one 8-bit gray level per pixel of gdimage. Returns 0 on success, -1 if the image exceeds the context limits or allocation fails. */
+ PuzzleView * const view,
+ gdImagePtr gdimage)
+{
+ unsigned int x, y;
+ const unsigned int x0 = 0U, y0 = 0U;
+ unsigned int x1, y1;
+ unsigned char *maptr;
+ int pixel;
+
+ view->map = NULL;
+ view->width = (unsigned int) gdImageSX(gdimage);
+ view->height = (unsigned int) gdImageSY(gdimage);
+ view->sizeof_map = (size_t) (view->width * view->height); /* product is computed in unsigned int before the cast */
+ if (view->width > context->puzzle_max_width ||
+ view->height > context->puzzle_max_height) {
+ return -1; /* image larger than the configured maximum */
+ }
+ if (view->sizeof_map <= (size_t) 0U || /* tested first, so the divisions below never see width == 0 */
+ INT_MAX / view->width < view->height ||
+ SIZE_MAX / view->width < view->height ||
+ (unsigned int) view->sizeof_map != view->sizeof_map) {
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ x1 = view->width - 1U;
+ y1 = view->height - 1U;
+ if (view->width <= 0U || view->height <= 0U) {
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ if ((view->map = calloc(view->sizeof_map, sizeof *view->map)) == NULL) {
+ return -1;
+ }
+ if (x1 > INT_MAX || y1 > INT_MAX) { /* GD uses "int" for coordinates */
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ maptr = view->map;
+ x = x1; /* pixels are written sequentially while x runs from x1 down to 0 (outer) and y from y1 down to 0 (inner) */
+ if (gdImageTrueColor(gdimage) != 0) {
+ do {
+ y = y1;
+ do {
+ pixel = gdImageGetTrueColorPixel(gdimage, (int) x, (int) y);
+ *maptr++ = (unsigned char)
+ ((gdTrueColorGetRed(pixel) * 77 + /* weighted R/G/B sum; the weights add up to 256 and +128 rounds the division */
+ gdTrueColorGetGreen(pixel) * 151 +
+ gdTrueColorGetBlue(pixel) * 28 + 128) / 256);
+ } while (y-- != y0);
+ } while (x-- != x0);
+ } else { /* palette image: same conversion using the palette entry of each pixel */
+ do {
+ y = y1;
+ do {
+ pixel = gdImagePalettePixel(gdimage, x, y);
+ *maptr++ = (unsigned char)
+ ((gdimage->red[pixel] * 77 +
+ gdimage->green[pixel] * 151 +
+ gdimage->blue[pixel] * 28 + 128) / 256);
+ } while (y-- != y0);
+ } while (x-- != x0);
+ }
+ return 0;
+}
+
+static double puzzle_softedgedlvl(const PuzzleView * const view, /* Mean gray level of the pixels within PUZZLE_PIXEL_FUZZ_SIZE of (x, y), restricted to the part of that window lying inside the view. */
+ const unsigned int x, const unsigned int y)
+{
+ unsigned int lvl = 0U;
+ unsigned int ax, ay;
+ unsigned int count = 0U;
+ const unsigned int xlimit = x + PUZZLE_PIXEL_FUZZ_SIZE; /* last column of the window (inclusive) */
+ const unsigned int ylimit = y + PUZZLE_PIXEL_FUZZ_SIZE; /* last row of the window (inclusive) */
+ if (x >= view->width || y >= view->height || xlimit <= x || ylimit <= y) { /* (x, y) outside the view, or the limits wrapped around */
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ if (x > PUZZLE_PIXEL_FUZZ_SIZE) { /* first column of the window, clamped at 0 */
+ ax = x - PUZZLE_PIXEL_FUZZ_SIZE;
+ } else {
+ ax = 0U;
+ }
+ do {
+ if (ax >= view->width) { /* window extends past the right edge: stop */
+ break;
+ }
+ if (y > PUZZLE_PIXEL_FUZZ_SIZE) { /* first row of the window, clamped at 0 */
+ ay = y - PUZZLE_PIXEL_FUZZ_SIZE;
+ } else {
+ ay = 0U;
+ }
+ do {
+ if (ay >= view->height) { /* window extends past the bottom edge: stop this column */
+ break;
+ }
+ count++;
+ lvl += (unsigned int) PUZZLE_VIEW_PIXEL(view, ax, ay);
+ } while (ay++ < ylimit);
+ } while (ax++ < xlimit);
+ if (count <= 0U) {
+ return 0.0;
+ }
+ return (double) lvl / (double) count; /* average over the pixels actually visited */
+}
+
+/*
+ * Average of puzzle_softedgedlvl() over the width x height rectangle whose
+ * first pixel is (x, y).
+ *
+ * The rectangle must be non-empty, must not wrap around the unsigned range
+ * and must lie entirely inside the view; any violation is an internal error.
+ */
+static double puzzle_get_avglvl(const PuzzleView * const view,
+                                const unsigned int x, const unsigned int y,
+                                const unsigned int width,
+                                const unsigned int height)
+{
+    const unsigned int last_x = x + width - 1U;
+    const unsigned int last_y = y + height - 1U;
+    unsigned int col, row;
+    double total = 0.0;
+
+    if (width == 0U || height == 0U) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    if (last_x < x || last_y < y) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    for (col = x; col <= last_x; col++) {
+        if (col >= view->width) {
+            puzzle_err_bug(__FILE__, __LINE__);
+        }
+        for (row = y; row <= last_y; row++) {
+            if (row >= view->height) {
+                puzzle_err_bug(__FILE__, __LINE__);
+            }
+            total += puzzle_softedgedlvl(view, col, row);
+        }
+    }
+
+    return total / (double) (width * height);
+}
+
+static int puzzle_fill_avglgls(PuzzleContext * const context, /* Compute a lambdas x lambdas grid of average gray levels sampled from the view. Returns 0 on success, -1 on allocation failure. */
+ PuzzleAvgLvls * const avglvls,
+ const PuzzleView * const view,
+ const unsigned int lambdas)
+{
+ double width = (double) view->width;
+ double height = (double) view->height;
+ double xshift, yshift;
+ double x, y;
+ unsigned int p;
+ unsigned int lx, ly;
+ unsigned int xd, yd;
+ unsigned int px, py;
+ unsigned int lwidth, lheight;
+ double avglvl;
+
+ avglvls->lambdas = lambdas;
+ avglvls->sizeof_lvls = (size_t) lambdas * lambdas;
+ if (UINT_MAX / lambdas < lambdas || /* NOTE(review): divides by lambdas; assumes callers never pass 0 - confirm the setter enforces this */
+ (unsigned int) avglvls->sizeof_lvls != avglvls->sizeof_lvls) {
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ if ((avglvls->lvls = calloc(avglvls->sizeof_lvls,
+ sizeof *avglvls->lvls)) == NULL) {
+ return -1;
+ }
+ xshift = (width -
+ (width * (double) lambdas / (double) SUCC(lambdas))) / 2.0; /* half of the width left over after taking lambdas/(lambdas+1) of it */
+ yshift = (height -
+ (height * (double) lambdas / (double) SUCC(lambdas))) / 2.0;
+ p = (unsigned int) round(MIN(width, height) /
+ (SUCC(lambdas) * context->puzzle_p_ratio)); /* side of the square sampled for each grid cell */
+ if (p < PUZZLE_MIN_P) {
+ p = PUZZLE_MIN_P;
+ }
+ lx = 0U;
+ do {
+ ly = 0U;
+ do {
+ x = xshift + (double) lx * PRED(width) / SUCC(lambdas); /* origin of cell (lx, ly) in view coordinates */
+ y = yshift + (double) ly * PRED(height) / SUCC(lambdas);
+ lwidth = (unsigned int) round
+ (xshift + (double) SUCC(lx) * PRED(width) /
+ (double) SUCC(lambdas) - x); /* distance to the origin of the next cell */
+ lheight = (unsigned int) round
+ (yshift + (double) SUCC(ly) * PRED(height) /
+ (double) SUCC(lambdas) - y);
+ if (p < lwidth) { /* center the p-wide square inside the cell when the cell is wider */
+ xd = (unsigned int) round(x + (lwidth - p) / 2.0);
+ } else {
+ xd = (unsigned int) round(x);
+ }
+ if (p < lheight) {
+ yd = (unsigned int) round(y + (lheight - p) / 2.0);
+ } else {
+ yd = (unsigned int) round(y);
+ }
+ if (view->width - xd < p) { /* fewer than p columns remain from xd: sample a single column */
+ px = 1U;
+ } else {
+ px = p;
+ }
+ if (view->height - yd < p) { /* fewer than p rows remain from yd: sample a single row */
+ py = 1U;
+ } else {
+ py = p;
+ }
+ if (px > 0U && py > 0U) {
+ avglvl = puzzle_get_avglvl(view, xd, yd, px, py);
+ } else {
+ avglvl = 0.0;
+ }
+ PUZZLE_AVGLVL(avglvls, lx, ly) = avglvl;
+ } while (++ly < lambdas);
+ } while (++lx < lambdas);
+
+ return 0;
+}
+
+static unsigned int puzzle_add_neighbors(double ** const vecur, /* Append, at *vecur, the difference between cell (lx, ly) and each adjacent grid cell; advances *vecur and returns how many values were written. */
+ const unsigned int max_neighbors,
+ const PuzzleAvgLvls * const avglvls,
+ const unsigned int lx,
+ const unsigned int ly)
+{
+ unsigned int ax, ay;
+ unsigned int xlimit, ylimit;
+ unsigned int neighbors = 0U;
+ const double ref = PUZZLE_AVGLVL(avglvls, lx, ly);
+
+ if (max_neighbors != 8U) { /* only the 3x3 neighborhood is supported */
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ if (lx >= avglvls->lambdas - 1U) { /* last column visited, clamped to the grid */
+ xlimit = avglvls->lambdas - 1U;
+ } else {
+ xlimit = lx + 1U;
+ }
+ if (ly >= avglvls->lambdas - 1U) { /* last row visited, clamped to the grid */
+ ylimit = avglvls->lambdas - 1U;
+ } else {
+ ylimit = ly + 1U;
+ }
+ if (lx <= 0U) { /* first column visited, clamped to the grid */
+ ax = 0U;
+ } else {
+ ax = lx - 1U;
+ }
+ do {
+ if (ly <= 0U) { /* first row visited, clamped to the grid */
+ ay = 0U;
+ } else {
+ ay = ly - 1U;
+ }
+ do {
+ if (ax == lx && ay == ly) {
+ continue; /* skip the cell itself; in a do/while this jumps to the loop condition below */
+ }
+ *(*vecur)++ = ref - PUZZLE_AVGLVL(avglvls, ax, ay);
+ neighbors++;
+ if (neighbors <= 0U) { /* counter wrapped around */
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ } while (ay++ < ylimit);
+ } while (ax++ < xlimit);
+ if (neighbors > max_neighbors) {
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ return neighbors;
+}
+
+static int puzzle_fill_dvec(PuzzleDvec * const dvec, /* Build dvec from the grid of average levels: neighbor differences for every cell, in lx-major order. Returns 0 on success, -1 on allocation failure. */
+ const PuzzleAvgLvls * const avglvls)
+{
+ unsigned int lambdas;
+ unsigned int lx, ly;
+ double *vecur;
+
+ lambdas = avglvls->lambdas;
+ dvec->sizeof_compressed_vec = (size_t) 0U;
+ dvec->sizeof_vec = (size_t) (lambdas * lambdas * PUZZLE_NEIGHBORS); /* capacity: PUZZLE_NEIGHBORS slots per cell */
+ if (SIZE_MAX /
+ ((size_t) (lambdas * lambdas)) < (size_t) PUZZLE_NEIGHBORS || /* NOTE(review): divides by lambdas * lambdas; assumes lambdas != 0 - confirm */
+ (unsigned int) dvec->sizeof_vec != dvec->sizeof_vec) {
+ puzzle_err_bug(__FILE__, __LINE__);
+ }
+ if ((dvec->vec = calloc(dvec->sizeof_vec, sizeof *dvec->vec)) == NULL) {
+ return -1;
+ }
+ vecur = dvec->vec; /* write cursor, advanced by puzzle_add_neighbors() */
+ lx = 0U;
+ do {
+ ly = 0U;
+ do {
+ (void) puzzle_add_neighbors(&vecur, PUZZLE_NEIGHBORS,
+ avglvls, lx, ly);
+ } while (++ly < lambdas);
+ } while (++lx < lambdas);
+ dvec->sizeof_compressed_vec = (size_t) (vecur - dvec->vec); /* number of values actually written; cells on the grid border contribute fewer than PUZZLE_NEIGHBORS */
+
+ return 0;
+}
+
+/*
+ * Convert the image to true color and replace every pixel with the result of
+ * alpha-blending it over an opaque white background.
+ */
+static void puzzle_remove_transparency(gdImagePtr gdimage)
+{
+    const int white = gdTrueColor(255, 255, 255);
+    int row, col;
+
+    gdImagePaletteToTrueColor(gdimage);
+
+    row = 0;
+    while (row < gdImageSY(gdimage)) {
+        col = 0;
+        while (col < gdImageSX(gdimage)) {
+            const int original = gdImageGetTrueColorPixel(gdimage, col, row);
+
+            gdImageSetPixel(gdimage, col, row, gdAlphaBlend(white, original));
+            col++;
+        }
+        row++;
+    }
+}
+
+/*
+ * Open an image file, detect its format from the file header and decode it
+ * with the matching GD loader.  Returns NULL when the file cannot be opened,
+ * the format is not JPEG/PNG/GIF, or the loader itself returns NULL.
+ */
+static gdImagePtr puzzle_create_gdimage_from_file(const char * const file)
+{
+    FILE * const fp = fopen(file, "rb");
+    gdImagePtr decoded;
+
+    if (fp == NULL) {
+        return NULL;
+    }
+    switch (puzzle_get_image_type_from_fp(fp)) {
+    case PUZZLE_IMAGE_TYPE_JPEG:
+        decoded = gdImageCreateFromJpeg(fp);
+        break;
+    case PUZZLE_IMAGE_TYPE_PNG:
+        decoded = gdImageCreateFromPng(fp);
+        break;
+    case PUZZLE_IMAGE_TYPE_GIF:
+        decoded = gdImageCreateFromGif(fp);
+        break;
+    default:
+        decoded = NULL;
+        break;
+    }
+    (void) fclose(fp);
+    return decoded;
+}
+
+/*
+ * Decode an encoded image held in memory (size bytes at mem) with the GD
+ * loader matching its signature.
+ *
+ * Returns NULL when:
+ *  - mem is NULL;
+ *  - the buffer is shorter than MAX_SIGNATURE_LENGTH, because the signature
+ *    check compares up to that many bytes and would otherwise read past the
+ *    end of the buffer (the file-based path likewise requires a full header);
+ *  - size does not fit in the "int" the GD *Ptr loaders take, which would
+ *    otherwise be narrowed silently;
+ *  - the format is not JPEG/PNG/GIF, or the loader itself returns NULL.
+ */
+static gdImagePtr puzzle_create_gdimage_from_mem(const void * const mem, const size_t size)
+{
+    gdImagePtr gdimage = NULL;
+
+    if (mem == NULL ||
+        size < (size_t) MAX_SIGNATURE_LENGTH ||
+        size > (size_t) INT_MAX) {
+        return NULL;
+    }
+    switch (puzzle_get_image_type_from_header((const unsigned char *) mem)) {
+    case PUZZLE_IMAGE_TYPE_JPEG:
+        gdimage = gdImageCreateFromJpegPtr((int) size, (void *) mem);
+        break;
+    case PUZZLE_IMAGE_TYPE_PNG:
+        gdimage = gdImageCreateFromPngPtr((int) size, (void *) mem);
+        break;
+    case PUZZLE_IMAGE_TYPE_GIF:
+        gdimage = gdImageCreateFromGifPtr((int) size, (void *) mem);
+        break;
+    default:
+        gdimage = NULL;
+        break;
+    }
+    return gdimage;
+}
+
+/*
+ * Run the whole pipeline on a decoded image: gray-level view, optional
+ * autocrop (when the context enables it), grid of average levels, then the
+ * vector of neighbor differences stored in dvec.
+ *
+ * dvec is re-initialised first.  The temporary view and level grid are always
+ * released before returning.  Returns 0 on success, otherwise the status of
+ * the step that failed.  A context whose magic number is wrong is an
+ * internal error.
+ */
+static int puzzle_fill_dvec_from_gdimage(PuzzleContext * const context,
+                                         PuzzleDvec * const dvec,
+                                         const gdImagePtr gdimage)
+{
+    PuzzleView view;
+    PuzzleAvgLvls avglvls;
+    int status;
+
+    if (context->magic != PUZZLE_CONTEXT_MAGIC) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    puzzle_init_view(&view);
+    puzzle_init_avglvls(&avglvls);
+    puzzle_init_dvec(context, dvec);
+
+    status = puzzle_getview_from_gdimage(context, &view, gdimage);
+    if (status == 0) {
+        if (context->puzzle_enable_autocrop != 0) {
+            status = puzzle_autocrop_view(context, &view);
+        }
+        /* Only a negative autocrop status stops the pipeline. */
+        if (status >= 0) {
+            status = puzzle_fill_avglgls(context, &avglvls,
+                                         &view, context->puzzle_lambdas);
+            if (status == 0) {
+                status = puzzle_fill_dvec(dvec, &avglvls);
+            }
+        }
+    }
+    puzzle_free_view(&view);
+    puzzle_free_avglvls(&avglvls);
+
+    return status;
+}
+
+/*
+ * Compute dvec from an image file: decode it, flatten transparency onto
+ * white, run the signature pipeline and destroy the decoded image.
+ * Returns -1 when the file cannot be decoded, otherwise the pipeline status.
+ */
+int puzzle_fill_dvec_from_file(PuzzleContext * const context,
+                               PuzzleDvec * const dvec,
+                               const char * const file)
+{
+    gdImagePtr const decoded = puzzle_create_gdimage_from_file(file);
+    int status;
+
+    if (decoded == NULL) {
+        return -1;
+    }
+    puzzle_remove_transparency(decoded);
+    status = puzzle_fill_dvec_from_gdimage(context, dvec, decoded);
+    gdImageDestroy(decoded);
+
+    return status;
+}
+
+/*
+ * Compute dvec from an encoded image held in memory (size bytes at mem):
+ * decode it, flatten transparency onto white, run the signature pipeline and
+ * destroy the decoded image.  Returns -1 when the buffer cannot be decoded,
+ * otherwise the pipeline status.
+ */
+int puzzle_fill_dvec_from_mem(PuzzleContext * const context,
+                              PuzzleDvec * const dvec,
+                              const void * const mem,
+                              const size_t size)
+{
+    gdImagePtr const decoded = puzzle_create_gdimage_from_mem(mem, size);
+    int status;
+
+    if (decoded == NULL) {
+        return -1;
+    }
+    puzzle_remove_transparency(decoded);
+    status = puzzle_fill_dvec_from_gdimage(context, dvec, decoded);
+    gdImageDestroy(decoded);
+
+    return status;
+}
+
+/*
+ * Print the first sizeof_compressed_vec values of the vector to stdout, one
+ * per line in "%g" format.  A count of zero is treated as an internal error
+ * (puzzle_err_bug aborts).  Always returns 0.
+ */
+int puzzle_dump_dvec(PuzzleContext * const context,
+                     const PuzzleDvec * const dvec)
+{
+    const size_t count = dvec->sizeof_compressed_vec;
+    size_t i;
+
+    (void) context;
+    if (count == (size_t) 0U) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    for (i = (size_t) 0U; i < count; i++) {
+        printf("%g\n", dvec->vec[i]);
+    }
+    return 0;
+}
diff --git a/deduper/libpuzzle/src/globals.h b/deduper/libpuzzle/src/globals.h
new file mode 100644
index 0000000..757c5c7
--- /dev/null
+++ b/deduper/libpuzzle/src/globals.h
@@ -0,0 +1,26 @@
+#ifndef __GLOBALS_H__
+#define __GLOBALS_H__ 1
+
+#ifdef DEFINE_GLOBALS /* defined by the translation unit that owns the globals before it includes this header (puzzle.c does so) */
+# define GLOBAL0(A) A /* definition without an initializer */
+# define GLOBAL(A, B) A = B /* definition initialized with B */
+#else
+# define GLOBAL0(A) extern A /* declaration only */
+# define GLOBAL(A, B) extern A /* declaration only; the initializer B is discarded */
+#endif
+
+GLOBAL(PuzzleContext puzzle_global_context, /* default settings; puzzle_init_context() copies this object into a new context */
+{ /* _COMA_ expands to "," (puzzle.h); a literal comma here would split the initializer into several macro arguments */
+ /* unsigned int puzzle_max_width */ PUZZLE_DEFAULT_MAX_WIDTH _COMA_
+ /* unsigned int puzzle_max_height */ PUZZLE_DEFAULT_MAX_HEIGHT _COMA_
+ /* unsigned int puzzle_lambdas */ PUZZLE_DEFAULT_LAMBDAS _COMA_
+ /* double puzzle_p_ratio */ PUZZLE_DEFAULT_P_RATIO _COMA_
+ /* double puzzle_noise_cutoff */ PUZZLE_DEFAULT_NOISE_CUTOFF _COMA_
+ /* double puzzle_contrast_barrier_for_cropping */
+ PUZZLE_DEFAULT_CONTRAST_BARRIER_FOR_CROPPING _COMA_
+ /* double puzzle_max_cropping_ratio */
+ PUZZLE_DEFAULT_MAX_CROPPING_RATIO _COMA_
+ /* int puzzle_enable_autocrop */ PUZZLE_DEFAULT_ENABLE_AUTOCROP _COMA_
+ /* unsigned long magic */ PUZZLE_CONTEXT_MAGIC _COMA_
+});
+#endif
diff --git a/deduper/libpuzzle/src/pics/Makefile.am b/deduper/libpuzzle/src/pics/Makefile.am
new file mode 100644
index 0000000..510311f
--- /dev/null
+++ b/deduper/libpuzzle/src/pics/Makefile.am
@@ -0,0 +1,8 @@
+EXTRA_DIST = \
+ pic-a-0.jpg \
+ pic-a-1.jpg \
+ luxmarket_tshirt01.jpg \
+ luxmarket_tshirt01_black.jpg \
+ luxmarket_tshirt01_sal.jpg \
+ luxmarket_tshirt01_sheum.jpg \
+ duck.gif
diff --git a/deduper/libpuzzle/src/pics/duck.gif b/deduper/libpuzzle/src/pics/duck.gif
new file mode 100644
index 0000000..96c3037
--- /dev/null
+++ b/deduper/libpuzzle/src/pics/duck.gif
Binary files differ
diff --git a/deduper/libpuzzle/src/pics/luxmarket_tshirt01.jpg b/deduper/libpuzzle/src/pics/luxmarket_tshirt01.jpg
new file mode 100644
index 0000000..ffaf7eb
--- /dev/null
+++ b/deduper/libpuzzle/src/pics/luxmarket_tshirt01.jpg
Binary files differ
diff --git a/deduper/libpuzzle/src/pics/luxmarket_tshirt01_black.jpg b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_black.jpg
new file mode 100644
index 0000000..73cac7b
--- /dev/null
+++ b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_black.jpg
Binary files differ
diff --git a/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sal.jpg b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sal.jpg
new file mode 100644
index 0000000..cb0cefe
--- /dev/null
+++ b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sal.jpg
Binary files differ
diff --git a/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sheum.jpg b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sheum.jpg
new file mode 100644
index 0000000..185393c
--- /dev/null
+++ b/deduper/libpuzzle/src/pics/luxmarket_tshirt01_sheum.jpg
Binary files differ
diff --git a/deduper/libpuzzle/src/pics/pic-a-0.jpg b/deduper/libpuzzle/src/pics/pic-a-0.jpg
new file mode 100644
index 0000000..3dd4a3b
--- /dev/null
+++ b/deduper/libpuzzle/src/pics/pic-a-0.jpg
Binary files differ
diff --git a/deduper/libpuzzle/src/pics/pic-a-1.jpg b/deduper/libpuzzle/src/pics/pic-a-1.jpg
new file mode 100644
index 0000000..95f0e77
--- /dev/null
+++ b/deduper/libpuzzle/src/pics/pic-a-1.jpg
Binary files differ
diff --git a/deduper/libpuzzle/src/puzzle-diff.c b/deduper/libpuzzle/src/puzzle-diff.c
new file mode 100644
index 0000000..e0f3626
--- /dev/null
+++ b/deduper/libpuzzle/src/puzzle-diff.c
@@ -0,0 +1,130 @@
+#include "puzzle_common.h"
+#include "puzzle.h"
+
+typedef struct Opts_ { /* Command-line settings of puzzle-diff that are not stored in the PuzzleContext. */
+ const char *file1; /* first positional argument */
+ const char *file2; /* second positional argument */
+ int fix_for_texts; /* defaults to 1; cleared by -t; passed to puzzle_vector_normalized_distance() */
+ int exit; /* defaults to 0; set by -e: report the result through the exit status (10/20) instead of printing the distance */
+ double similarity_threshold; /* distance above which -e reports "not similar"; set by -E */
+} Opts;
+
+/*
+ * Print the command-line help to stdout and terminate the process with
+ * EXIT_SUCCESS.  Never returns.
+ *
+ * Help text corrections: the -b placeholder is now closed with '>', the -E
+ * placeholder is bracketed like the others, -H documents <height> rather
+ * than <width>, and -t has the same ':' separator as the other options.
+ */
+void usage(void)
+{
+    puts("\nUsage: puzzle-diff [-b <contrast barrier for cropping>] [-c]\n"
+         " [-C <max cropping ratio>] [-e] [-E <similarity threshold>] [-h]\n"
+         " [-H <max height>] [-l <lambdas>] [-n <noise cutoff>]\n"
+         " [-p <p ratio>] [-t] [-W <max width>] <file 1> <file 2>\n\n"
+         "Visually compares two images and returns their distance.\n\n"
+         "-b <contrast barrier for cropping>\n"
+         "-c : disable autocrop\n"
+         "-C <max cropping ratio>\n"
+         "-e : exit with 10 (images are similar) or 20 (images are not)\n"
+         "-E <similarity threshold> : for -e\n"
+         "-h : show help\n"
+         "-H <height> : set max height\n"
+         "-l <lambdas> : change lambdas\n"
+         "-n <noise cutoff> : change noise cutoff\n"
+         "-p <ratio> : set p ratio\n"
+         "-t : disable fix for texts\n"
+         "-W <width> : set max width\n"
+         "\n");
+    exit(EXIT_SUCCESS);
+}
+
+/*
+ * Parse the puzzle-diff command line.
+ *
+ * Option values that tune the algorithm are forwarded to the matching
+ * puzzle_set_*() function on context; the remaining settings and the two
+ * file names are stored in opts.  -h, an unknown option, or a number of
+ * positional arguments other than two print the usage text, which
+ * terminates the process.  Returns 0.
+ */
+int parse_opts(Opts * const opts, PuzzleContext * context,
+               int argc, char * const *argv) {
+    extern char *optarg;
+    extern int optind;
+    int ch;
+
+    opts->similarity_threshold = PUZZLE_CVEC_SIMILARITY_THRESHOLD;
+    opts->exit = 0;
+    opts->fix_for_texts = 1;
+    for (;;) {
+        ch = getopt(argc, argv, "b:cC:eE:hH:l:n:p:tW:");
+        if (ch == -1) {
+            break;
+        }
+        switch (ch) {
+        case 'b':
+            puzzle_set_contrast_barrier_for_cropping(context, atof(optarg));
+            break;
+        case 'c':
+            puzzle_set_autocrop(context, 0);
+            break;
+        case 'C':
+            puzzle_set_max_cropping_ratio(context, atof(optarg));
+            break;
+        case 'e':
+            opts->exit = 1;
+            break;
+        case 'E':
+            opts->similarity_threshold = atof(optarg);
+            break;
+        case 'h':
+            usage();
+            /* NOTREACHED */
+        case 'H':
+            puzzle_set_max_height(context, strtoul(optarg, NULL, 10));
+            break;
+        case 'l':
+            puzzle_set_lambdas(context, strtoul(optarg, NULL, 10));
+            break;
+        case 'n':
+            puzzle_set_noise_cutoff(context, atof(optarg));
+            break;
+        case 'p':
+            puzzle_set_p_ratio(context, atof(optarg));
+            break;
+        case 't':
+            opts->fix_for_texts = 0;
+            break;
+        case 'W':
+            puzzle_set_max_width(context, strtoul(optarg, NULL, 10));
+            break;
+        default:
+            usage();
+            /* NOTREACHED */
+        }
+    }
+    /* Exactly two positional arguments (the files to compare) must remain. */
+    if (argc - optind != 2) {
+        usage();
+    }
+    opts->file1 = argv[optind];
+    opts->file2 = argv[optind + 1];
+
+    return 0;
+}
+
+/*
+ * puzzle-diff entry point: compute the signatures of two image files and
+ * report their normalized distance.
+ *
+ * Exit status:
+ *   1  - one of the files could not be read;
+ *   0  - distance printed on stdout (default mode);
+ *   10 - with -e, distance <= threshold (similar);
+ *   20 - with -e, distance >  threshold (not similar).
+ *
+ * Every path goes through the same cleanup, so the vectors and the context
+ * are released when a file fails to load too (the early returns used to skip
+ * that).  Freeing a vector that was only initialised is harmless: its
+ * pointer is NULL.
+ */
+int main(int argc, char *argv[])
+{
+    Opts opts;
+    PuzzleContext context;
+    PuzzleCvec cvec1, cvec2;
+    double d;
+    int status = 1;
+
+    puzzle_init_context(&context);
+    parse_opts(&opts, &context, argc, argv);
+    puzzle_init_cvec(&context, &cvec1);
+    puzzle_init_cvec(&context, &cvec2);
+    if (puzzle_fill_cvec_from_file(&context, &cvec1, opts.file1) != 0) {
+        fprintf(stderr, "Unable to read [%s]\n", opts.file1);
+        goto cleanup;
+    }
+    if (puzzle_fill_cvec_from_file(&context, &cvec2, opts.file2) != 0) {
+        fprintf(stderr, "Unable to read [%s]\n", opts.file2);
+        goto cleanup;
+    }
+    d = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2,
+                                          opts.fix_for_texts);
+    if (opts.exit == 0) {
+        printf("%g\n", d);
+        status = 0;
+    } else if (d > opts.similarity_threshold) {
+        status = 20;
+    } else {
+        status = 10;
+    }
+ cleanup:
+    puzzle_free_cvec(&context, &cvec1);
+    puzzle_free_cvec(&context, &cvec2);
+    puzzle_free_context(&context);
+
+    return status;
+}
diff --git a/deduper/libpuzzle/src/puzzle.c b/deduper/libpuzzle/src/puzzle.c
new file mode 100644
index 0000000..e21c252
--- /dev/null
+++ b/deduper/libpuzzle/src/puzzle.c
@@ -0,0 +1,22 @@
+#define DEFINE_GLOBALS 1
+#include "puzzle_common.h"
+#include "puzzle_p.h"
+#include "puzzle.h"
+#include "globals.h"
+
+/* Initialise a context with the library defaults held in puzzle_global_context. */
+void puzzle_init_context(PuzzleContext * const context)
+{
+    memcpy(context, &puzzle_global_context, sizeof *context);
+}
+
+/* Counterpart of puzzle_init_context(); currently there is nothing to release. */
+void puzzle_free_context(PuzzleContext * const context)
+{
+    (void) context;
+    return;
+}
+
+/*
+ * Report a failed internal consistency check on stderr, naming the source
+ * file and line of the check, then abort the process.  Never returns.
+ */
+void puzzle_err_bug(const char * const file, const int line)
+{
+    (void) fprintf(stderr, "*BUG* File: [%s] Line: [%d]\n", file, line);
+    abort();
+}
+
diff --git a/deduper/libpuzzle/src/puzzle.h b/deduper/libpuzzle/src/puzzle.h
new file mode 100644
index 0000000..c31b43f
--- /dev/null
+++ b/deduper/libpuzzle/src/puzzle.h
@@ -0,0 +1,122 @@
+#ifndef __PUZZLE_H__
+#define __PUZZLE_H__ 1
+
+#define PUZZLE_VERSION_MAJOR 0
+#define PUZZLE_VERSION_MINOR 11
+
+#include "puzzle_common.h"
+
+typedef struct PuzzleDvec_ { /* Vector of doubles produced by the puzzle_fill_dvec_from_*() functions. */
+ size_t sizeof_vec; /* number of elements allocated in vec */
+ size_t sizeof_compressed_vec; /* number of elements actually filled in (what puzzle_dump_dvec() prints) */
+ double *vec; /* heap storage; released by puzzle_free_dvec() */
+} PuzzleDvec;
+
+typedef struct PuzzleCvec_ { /* Vector of signed chars derived from a PuzzleDvec by puzzle_fill_cvec_from_dvec(). */
+ size_t sizeof_vec; /* number of elements in vec */
+ signed char *vec; /* heap storage; released by puzzle_free_cvec() */
+} PuzzleCvec;
+
+typedef struct PuzzleCompressedCvec_ { /* Output of puzzle_compress_cvec() / input of puzzle_uncompress_cvec(). */
+ size_t sizeof_compressed_vec; /* presumably the number of bytes in vec - the compression code is not visible here, confirm */
+ unsigned char *vec; /* storage; see puzzle_free_compressed_cvec() */
+} PuzzleCompressedCvec;
+
+typedef struct PuzzleContext_ { /* Tunable settings passed to every library call; initialise with puzzle_init_context(). */
+ unsigned int puzzle_max_width; /* images wider than this are rejected */
+ unsigned int puzzle_max_height; /* images taller than this are rejected */
+ unsigned int puzzle_lambdas; /* side of the square grid of average levels */
+ double puzzle_p_ratio; /* divisor used when sizing the square sampled for each grid cell */
+ double puzzle_noise_cutoff; /* NOTE(review): consumer is not visible in this part of the file */
+ double puzzle_contrast_barrier_for_cropping; /* fraction of the total contrast that autocrop may remove from each side */
+ double puzzle_max_cropping_ratio; /* upper bound on the fraction of an axis autocrop may remove from each side */
+ int puzzle_enable_autocrop; /* non-zero: autocrop the view before computing the signature */
+ unsigned long magic; /* must equal PUZZLE_CONTEXT_MAGIC; checked before a signature is computed */
+} PuzzleContext;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void puzzle_init_context(PuzzleContext * const context);
+void puzzle_free_context(PuzzleContext * const context);
+int puzzle_set_max_width(PuzzleContext * const context,
+ const unsigned int width);
+int puzzle_set_max_height(PuzzleContext * const context,
+ const unsigned int height);
+int puzzle_set_lambdas(PuzzleContext * const context,
+ const unsigned int lambdas);
+int puzzle_set_noise_cutoff(PuzzleContext * const context,
+ const double noise_cutoff);
+int puzzle_set_p_ratio(PuzzleContext * const context,
+ const double p_ratio);
+int puzzle_set_contrast_barrier_for_cropping(PuzzleContext * const context,
+ const double barrier);
+int puzzle_set_max_cropping_ratio(PuzzleContext * const context,
+ const double ratio);
+int puzzle_set_autocrop(PuzzleContext * const context,
+ const int enable);
+void puzzle_init_cvec(PuzzleContext * const context,
+ PuzzleCvec * const cvec);
+void puzzle_init_dvec(PuzzleContext * const context,
+ PuzzleDvec * const dvec);
+int puzzle_fill_dvec_from_file(PuzzleContext * const context,
+ PuzzleDvec * const dvec,
+ const char * const file);
+int puzzle_fill_cvec_from_file(PuzzleContext * const context,
+ PuzzleCvec * const cvec,
+ const char * const file);
+int puzzle_fill_dvec_from_mem(PuzzleContext * const context,
+ PuzzleDvec * const dvec,
+ const void * const mem,
+ const size_t size);
+int puzzle_fill_cvec_from_mem(PuzzleContext * const context,
+ PuzzleCvec * const cvec,
+ const void * const mem,
+ const size_t size);
+int puzzle_fill_cvec_from_dvec(PuzzleContext * const context,
+ PuzzleCvec * const cvec,
+ const PuzzleDvec * const dvec);
+void puzzle_free_cvec(PuzzleContext * const context,
+ PuzzleCvec * const cvec);
+void puzzle_free_dvec(PuzzleContext * const context,
+ PuzzleDvec * const dvec);
+int puzzle_dump_cvec(PuzzleContext * const context,
+ const PuzzleCvec * const cvec);
+int puzzle_dump_dvec(PuzzleContext * const context,
+ const PuzzleDvec * const dvec);
+int puzzle_cvec_cksum(PuzzleContext * const context,
+ const PuzzleCvec * const cvec, unsigned int * const sum);
+void puzzle_init_compressed_cvec(PuzzleContext * const context,
+ PuzzleCompressedCvec * const compressed_cvec);
+void puzzle_free_compressed_cvec(PuzzleContext * const context,
+ PuzzleCompressedCvec * const compressed_cvec);
+int puzzle_compress_cvec(PuzzleContext * const context,
+ PuzzleCompressedCvec * const compressed_cvec,
+ const PuzzleCvec * const cvec);
+int puzzle_uncompress_cvec(PuzzleContext * const context,
+ const PuzzleCompressedCvec * const compressed_cvec,
+ PuzzleCvec * const cvec);
+int puzzle_vector_sub(PuzzleContext * const context,
+ PuzzleCvec * const cvecr,
+ const PuzzleCvec * const cvec1,
+ const PuzzleCvec * const cvec2,
+ const int fix_for_texts);
+double puzzle_vector_euclidean_length(PuzzleContext * const context,
+ const PuzzleCvec * const cvec);
+double puzzle_vector_normalized_distance(PuzzleContext * const context,
+ const PuzzleCvec * const cvec1,
+ const PuzzleCvec * const cvec2,
+ const int fix_for_texts);
+
+#ifdef __cplusplus
+}
+#endif
+
+#define PUZZLE_CVEC_SIMILARITY_THRESHOLD 0.6 /* default for puzzle-diff -E: a normalized distance above it means "not similar" */
+#define PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD 0.7 /* alternative, looser threshold */
+#define PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD 0.3 /* alternative, stricter threshold */
+#define PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD 0.2 /* alternative, strictest threshold */
+
+#define _COMA_ , /* lets globals.h write commas inside a single macro argument */
+
+#endif
diff --git a/deduper/libpuzzle/src/puzzle_common.h b/deduper/libpuzzle/src/puzzle_common.h
new file mode 100644
index 0000000..ebd340b
--- /dev/null
+++ b/deduper/libpuzzle/src/puzzle_common.h
@@ -0,0 +1,18 @@
+#ifndef __PUZZLE_COMMON_H__
+#define __PUZZLE_COMMON_H__ 1
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#ifndef errno
+extern int errno;
+#endif
+
+#endif
diff --git a/deduper/libpuzzle/src/puzzle_p.h b/deduper/libpuzzle/src/puzzle_p.h
new file mode 100644
index 0000000..2f09494
--- /dev/null
+++ b/deduper/libpuzzle/src/puzzle_p.h
@@ -0,0 +1,67 @@
+#ifndef __PUZZLE_P_H__
+#define __PUZZLE_P_H__ 1
+
+#include <math.h>
+#include <gd.h>
+
+typedef struct PuzzleView_ { /* 8-bit gray-level copy of an image, one byte per pixel. */
+ unsigned int width; /* pixels per row, as used by PUZZLE_VIEW_PIXEL() */
+ unsigned int height; /* number of rows */
+ size_t sizeof_map; /* width * height */
+ unsigned char *map; /* pixel storage; released by puzzle_free_view() */
+} PuzzleView;
+
+typedef struct PuzzleAvgLvls_ { /* Square grid of average gray levels sampled from a view. */
+ unsigned int lambdas; /* grid side; cells are addressed with PUZZLE_AVGLVL() */
+ size_t sizeof_lvls; /* lambdas * lambdas */
+ double *lvls; /* cell storage; released by puzzle_free_avglvls() */
+} PuzzleAvgLvls;
+
+typedef enum PuzzleImageTypeCode_ { /* Result of the image signature detection: ERROR = header could not be read, UNKNOWN = no known signature matched. */
+ PUZZLE_IMAGE_TYPE_ERROR, PUZZLE_IMAGE_TYPE_UNKNOWN, PUZZLE_IMAGE_TYPE_JPEG,
+ PUZZLE_IMAGE_TYPE_GIF, PUZZLE_IMAGE_TYPE_PNG
+} PuzzleImageTypeCode;
+
+typedef struct PuzzleImageType_ { /* One entry of the table mapping a file signature to an image type. */
+ const size_t sizeof_signature; /* number of bytes of signature to compare */
+ const unsigned char *signature; /* leading bytes identifying the format; NULL marks the end of the table */
+ const PuzzleImageTypeCode image_type_code; /* type reported when the signature matches */
+} PuzzleImageType;
+
+#ifndef SIZE_MAX /* fallback for environments whose headers do not provide SIZE_MAX */
+# define SIZE_MAX ((size_t) -1)
+#endif
+
+#define PUZZLE_DEFAULT_LAMBDAS 9 /* default grid side (lambdas x lambdas average levels) */
+#define PUZZLE_DEFAULT_MAX_WIDTH 3000 /* default limit on image width */
+#define PUZZLE_DEFAULT_MAX_HEIGHT 3000 /* default limit on image height */
+#define PUZZLE_DEFAULT_NOISE_CUTOFF 2.0 /* default puzzle_noise_cutoff; NOTE(review): consumer is not visible in this part of the file */
+#define PUZZLE_DEFAULT_P_RATIO 2.0 /* default puzzle_p_ratio */
+#define PUZZLE_MIN_P 2 /* smallest side allowed for the square sampled per grid cell */
+#define PUZZLE_PIXEL_FUZZ_SIZE 1 /* radius of the averaging window used by puzzle_softedgedlvl() */
+#define PUZZLE_NEIGHBORS 8 /* maximum number of neighbors compared per grid cell */
+#define PUZZLE_MIN_SIZE_FOR_CROPPING 100 /* autocrop leaves the view untouched when either dimension is smaller */
+#if PUZZLE_MIN_SIZE_FOR_CROPPING < 4
+# error PUZZLE_MIN_SIZE_FOR_CROPPING
+#endif
+#define PUZZLE_DEFAULT_CONTRAST_BARRIER_FOR_CROPPING 0.05 /* default puzzle_contrast_barrier_for_cropping */
+#define PUZZLE_DEFAULT_MAX_CROPPING_RATIO 0.25 /* default puzzle_max_cropping_ratio */
+#define PUZZLE_DEFAULT_ENABLE_AUTOCROP 1 /* autocrop is on by default */
+
+#define PUZZLE_VIEW_PIXEL(V, X, Y) (*((V)->map + (V)->width * (Y) + (X))) /* lvalue for pixel (X, Y): offset Y * width + X into map */
+#define PUZZLE_AVGLVL(A, X, Y) (*((A)->lvls + (A)->lambdas * (Y) + (X))) /* lvalue for grid cell (X, Y): offset Y * lambdas + X into lvls */
+
+#define PUZZLE_CONTEXT_MAGIC 0xdeadbeef /* value stored in PuzzleContext.magic by the default context */
+
+#ifndef MIN /* MIN/MAX evaluate their arguments more than once: do not pass expressions with side effects */
+# define MIN(A, B) ((A) < (B) ? (A) : (B))
+#endif
+#ifndef MAX
+# define MAX(A, B) ((A) > (B) ? (A) : (B))
+#endif
+#define SUCC(A) ((A) + 1) /* successor: A + 1, in the type of A after the usual promotions */
+#define PRED(A) ((A) - 1) /* predecessor: A - 1 */
+
+void puzzle_err_bug(const char * const file, const int line);
+
+#endif
diff --git a/deduper/libpuzzle/src/regress_1.c b/deduper/libpuzzle/src/regress_1.c
new file mode 100644
index 0000000..80462b8
--- /dev/null
+++ b/deduper/libpuzzle/src/regress_1.c
@@ -0,0 +1,32 @@
+#include "puzzle_common.h"
+#include "puzzle.h"
+
+#define EXPECTED_RESULT 111444570
+
+/*
+ * Regression test: compute the signature of a reference picture, compress
+ * it, uncompress it again and compare the checksum of the round-tripped
+ * vector with EXPECTED_RESULT.
+ *
+ * Exit status: 0 when the checksum matches (or when the picture cannot be
+ * loaded, which is kept as a non-failure as before), 1 otherwise.
+ *
+ * The results of the compress / uncompress / checksum steps are now checked
+ * (assuming the 0-on-success convention of the other library calls):
+ * previously a failure was ignored and the checksum was computed over
+ * whatever the vector happened to contain, possibly nothing at all.
+ */
+int main(void)
+{
+    PuzzleContext context;
+    PuzzleCvec cvec;
+    PuzzleCompressedCvec compressed_cvec;
+    unsigned int sum = 0U;
+    int failed = 1;
+
+    puzzle_init_context(&context);
+    puzzle_init_compressed_cvec(&context, &compressed_cvec);
+    puzzle_init_cvec(&context, &cvec);
+    if (puzzle_fill_cvec_from_file(&context, &cvec,
+                                   "pics/luxmarket_tshirt01.jpg") != 0) {
+        fprintf(stderr, "File not found\n");
+        exit(0);
+    }
+    if (puzzle_compress_cvec(&context, &compressed_cvec, &cvec) != 0) {
+        fprintf(stderr, "Unable to compress the vector\n");
+        goto out;
+    }
+    /* Start again from an empty vector so only the round trip is measured. */
+    puzzle_free_cvec(&context, &cvec);
+    puzzle_init_cvec(&context, &cvec);
+    if (puzzle_uncompress_cvec(&context, &compressed_cvec, &cvec) != 0) {
+        fprintf(stderr, "Unable to uncompress the vector\n");
+        goto out;
+    }
+    if (puzzle_cvec_cksum(&context, &cvec, &sum) != 0) {
+        fprintf(stderr, "Unable to compute the checksum\n");
+        goto out;
+    }
+    printf("%u %u\n", sum, (unsigned int) EXPECTED_RESULT);
+    failed = (sum != EXPECTED_RESULT);
+ out:
+    puzzle_free_cvec(&context, &cvec);
+    puzzle_free_compressed_cvec(&context, &compressed_cvec);
+    puzzle_free_context(&context);
+
+    return failed;
+}
diff --git a/deduper/libpuzzle/src/regress_2.c b/deduper/libpuzzle/src/regress_2.c
new file mode 100644
index 0000000..a37b626
--- /dev/null
+++ b/deduper/libpuzzle/src/regress_2.c
@@ -0,0 +1,72 @@
+#include "puzzle_common.h"
+#include "puzzle.h"
+
+/*
+ * Regression test 2: distances between a reference picture and five others.
+ *
+ * dist[0] is measured from picture 2 to picture 1, dist[1..5] from picture 1
+ * to pictures 2..6, all with the text fix enabled. The six distances are
+ * printed on one line.
+ *
+ * Returns 1 when dist[0] and dist[1] differ after truncation to two
+ * decimals, 2 when pictures 2-4 are above PUZZLE_CVEC_SIMILARITY_THRESHOLD
+ * or pictures 5-6 are below it, and 0 otherwise. A picture that cannot be
+ * loaded is reported on stderr and ends the process with status 0.
+ */
+int main(void)
+{
+    enum { PICTURE_COUNT = 6 };
+    static const char * const pictures[PICTURE_COUNT] = {
+        "pics/luxmarket_tshirt01.jpg",
+        "pics/luxmarket_tshirt01_black.jpg",
+        "pics/luxmarket_tshirt01_sal.jpg",
+        "pics/luxmarket_tshirt01_sheum.jpg",
+        "pics/duck.gif",
+        "pics/pic-a-0.jpg"
+    };
+    static const char * const load_errors[PICTURE_COUNT] = {
+        "File 1 not found\n",
+        "File 2 not found\n",
+        "File 3 not found\n",
+        "File 4 not found\n",
+        "File 5 not found\n",
+        "File 6 not found\n"
+    };
+    PuzzleContext ctx;
+    PuzzleCvec sig[PICTURE_COUNT];
+    double dist[PICTURE_COUNT];
+    int near_too_far, far_too_near;
+    int i;
+
+    puzzle_init_context(&ctx);
+    for (i = 0; i < PICTURE_COUNT; i++) {
+        puzzle_init_cvec(&ctx, &sig[i]);
+    }
+    for (i = 0; i < PICTURE_COUNT; i++) {
+        if (puzzle_fill_cvec_from_file(&ctx, &sig[i], pictures[i]) != 0) {
+            fputs(load_errors[i], stderr);
+            exit(0);
+        }
+    }
+
+    /* First distance uses swapped operands to check symmetry. */
+    dist[0] = puzzle_vector_normalized_distance(&ctx, &sig[1], &sig[0], 1);
+    for (i = 1; i < PICTURE_COUNT; i++) {
+        dist[i] = puzzle_vector_normalized_distance(&ctx, &sig[0], &sig[i], 1);
+    }
+    printf("%g %g %g %g %g %g\n",
+           dist[0], dist[1], dist[2], dist[3], dist[4], dist[5]);
+
+    for (i = 0; i < PICTURE_COUNT; i++) {
+        puzzle_free_cvec(&ctx, &sig[i]);
+    }
+    puzzle_free_context(&ctx);
+
+    if ((int) (dist[0] * 100.0) != (int) (dist[1] * 100.0)) {
+        return 1;
+    }
+    near_too_far = dist[0] > PUZZLE_CVEC_SIMILARITY_THRESHOLD ||
+                   dist[2] > PUZZLE_CVEC_SIMILARITY_THRESHOLD ||
+                   dist[3] > PUZZLE_CVEC_SIMILARITY_THRESHOLD;
+    far_too_near = dist[4] < PUZZLE_CVEC_SIMILARITY_THRESHOLD ||
+                   dist[5] < PUZZLE_CVEC_SIMILARITY_THRESHOLD;
+
+    return (near_too_far || far_too_near) ? 2 : 0;
+}
diff --git a/deduper/libpuzzle/src/regress_3.c b/deduper/libpuzzle/src/regress_3.c
new file mode 100644
index 0000000..33698ba
--- /dev/null
+++ b/deduper/libpuzzle/src/regress_3.c
@@ -0,0 +1,35 @@
+#include "puzzle_common.h"
+#include "puzzle.h"
+
+/* Largest distance accepted between the two test pictures. */
+#define PUZZLE_VECTOR_SLICE 0.6
+
+/*
+ * Regression test 3: the two pictures must stay within PUZZLE_VECTOR_SLICE
+ * of each other both with and without the text fix.
+ *
+ * Prints both distances and returns 2 when either exceeds the limit, else 0.
+ * A picture that cannot be loaded is reported on stderr and ends the process
+ * with status 0.
+ */
+int main(void)
+{
+    PuzzleContext ctx;
+    PuzzleCvec first, second;
+    double with_fix, without_fix;
+
+    puzzle_init_context(&ctx);
+    puzzle_init_cvec(&ctx, &first);
+    puzzle_init_cvec(&ctx, &second);
+
+    if (puzzle_fill_cvec_from_file(&ctx, &first, "pics/pic-a-0.jpg") != 0) {
+        fprintf(stderr, "File 1 not found\n");
+        exit(0);
+    }
+    if (puzzle_fill_cvec_from_file(&ctx, &second, "pics/pic-a-1.jpg") != 0) {
+        fprintf(stderr, "File 2 not found\n");
+        exit(0);
+    }
+
+    with_fix = puzzle_vector_normalized_distance(&ctx, &first, &second, 1);
+    without_fix = puzzle_vector_normalized_distance(&ctx, &first, &second, 0);
+    printf("%g %g\n", with_fix, without_fix);
+
+    puzzle_free_cvec(&ctx, &first);
+    puzzle_free_cvec(&ctx, &second);
+    puzzle_free_context(&ctx);
+
+    return (with_fix > PUZZLE_VECTOR_SLICE ||
+            without_fix > PUZZLE_VECTOR_SLICE) ? 2 : 0;
+}
diff --git a/deduper/libpuzzle/src/tunables.c b/deduper/libpuzzle/src/tunables.c
new file mode 100644
index 0000000..280dfb2
--- /dev/null
+++ b/deduper/libpuzzle/src/tunables.c
@@ -0,0 +1,84 @@
+#include "puzzle_common.h"
+#include "puzzle_p.h"
+#include "puzzle.h"
+#include "globals.h"
+
+/*
+ * Store `width` as the context's maximum width.
+ * Returns 0 on success, or -1 (context untouched) when `width` is 0.
+ */
+int puzzle_set_max_width(PuzzleContext * const context,
+                         const unsigned int width)
+{
+    const int accepted = (width != 0U);
+
+    if (accepted) {
+        context->puzzle_max_width = width;
+    }
+    return accepted ? 0 : -1;
+}
+
+/*
+ * Store `height` as the context's maximum height.
+ * Returns 0 on success, or -1 (context untouched) when `height` is 0.
+ */
+int puzzle_set_max_height(PuzzleContext * const context,
+                          const unsigned int height)
+{
+    const int accepted = (height != 0U);
+
+    if (accepted) {
+        context->puzzle_max_height = height;
+    }
+    return accepted ? 0 : -1;
+}
+
+/*
+ * Store `lambdas` in the context.
+ * Returns 0 on success, or -1 (context untouched) when `lambdas` is 0.
+ */
+int puzzle_set_lambdas(PuzzleContext * const context,
+                       const unsigned int lambdas)
+{
+    const int accepted = (lambdas != 0U);
+
+    if (accepted) {
+        context->puzzle_lambdas = lambdas;
+    }
+    return accepted ? 0 : -1;
+}
+
+/*
+ * Store `p_ratio` in the context.
+ * Values below 1.0 are rejected with -1 and leave the context untouched;
+ * anything else is stored and 0 is returned.
+ */
+int puzzle_set_p_ratio(PuzzleContext * const context, const double p_ratio)
+{
+    const int rejected = (p_ratio < 1.0);
+
+    if (!rejected) {
+        context->puzzle_p_ratio = p_ratio;
+    }
+    return rejected ? -1 : 0;
+}
+
+/*
+ * Store `noise_cutoff` in the context. No validation is performed, so the
+ * call always succeeds and returns 0.
+ */
+int puzzle_set_noise_cutoff(PuzzleContext * const context,
+                            const double noise_cutoff)
+{
+    PuzzleContext * const ctx = context;
+
+    ctx->puzzle_noise_cutoff = noise_cutoff;
+    return 0;
+}
+
+/*
+ * Store the contrast barrier used for cropping.
+ * Values that are zero or negative are rejected with -1 and leave the
+ * context untouched; anything else is stored and 0 is returned.
+ */
+int puzzle_set_contrast_barrier_for_cropping(PuzzleContext * const context,
+                                             const double barrier)
+{
+    const int rejected = (barrier <= 0.0);
+
+    if (!rejected) {
+        context->puzzle_contrast_barrier_for_cropping = barrier;
+    }
+    return rejected ? -1 : 0;
+}
+
+/*
+ * Store the maximum cropping ratio.
+ * Values that are zero or negative are rejected with -1 and leave the
+ * context untouched; anything else is stored and 0 is returned. No upper
+ * bound is enforced.
+ */
+int puzzle_set_max_cropping_ratio(PuzzleContext * const context,
+                                  const double ratio)
+{
+    const int rejected = (ratio <= 0.0);
+
+    if (!rejected) {
+        context->puzzle_max_cropping_ratio = ratio;
+    }
+    return rejected ? -1 : 0;
+}
+
+/*
+ * Switch autocrop on (any non-zero `enable`) or off (zero). The flag is
+ * normalised to 1 or 0 before being stored. Always returns 0.
+ */
+int puzzle_set_autocrop(PuzzleContext * const context, const int enable)
+{
+    const int flag = enable ? 1 : 0;
+
+    context->puzzle_enable_autocrop = flag;
+    return 0;
+}
diff --git a/deduper/libpuzzle/src/vector_ops.c b/deduper/libpuzzle/src/vector_ops.c
new file mode 100644
index 0000000..4fad5bf
--- /dev/null
+++ b/deduper/libpuzzle/src/vector_ops.c
@@ -0,0 +1,95 @@
+#include "puzzle_common.h"
+#include "puzzle_p.h"
+#include "puzzle.h"
+#include "globals.h"
+
+/*
+ * Element-wise difference cvecr = cvec1 - cvec2.
+ *
+ * Both inputs must have the same, non-zero length and cvecr must not own a
+ * buffer yet; either violation is reported through puzzle_err_bug(). The
+ * result buffer is allocated here with calloc().
+ *
+ * With fix_for_texts != 0, a pair made of 0 and -2 (in either order) yields
+ * -3 and a pair made of 0 and +2 yields +3 instead of the plain difference.
+ *
+ * Returns 0 on success, -1 when the allocation fails (cvecr->sizeof_vec is
+ * already set at that point, cvecr->vec stays NULL). `context` is unused.
+ */
+int puzzle_vector_sub(PuzzleContext * const context,
+                      PuzzleCvec * const cvecr,
+                      const PuzzleCvec * const cvec1,
+                      const PuzzleCvec * const cvec2,
+                      const int fix_for_texts)
+{
+    size_t idx;
+    signed char lhs, rhs, delta;
+
+    (void) context;
+    if (cvec1->sizeof_vec != cvec2->sizeof_vec ||
+        cvec1->sizeof_vec <= (size_t) 0U) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    if (cvecr->vec != NULL) {
+        puzzle_err_bug(__FILE__, __LINE__);
+    }
+    cvecr->sizeof_vec = cvec1->sizeof_vec;
+    cvecr->vec = calloc(cvecr->sizeof_vec, sizeof *cvecr->vec);
+    if (cvecr->vec == NULL) {
+        return -1;
+    }
+
+    /* Both loops walk from the last element down to index 0. */
+    idx = cvec1->sizeof_vec;
+    if (fix_for_texts == 0) {
+        for (;;) {
+            idx--;
+            cvecr->vec[idx] = cvec1->vec[idx] - cvec2->vec[idx];
+            if (idx == (size_t) 0U) {
+                break;
+            }
+        }
+        return 0;
+    }
+    for (;;) {
+        idx--;
+        lhs = cvec1->vec[idx];
+        rhs = cvec2->vec[idx];
+        if ((lhs == 0 || rhs == 0) && lhs + rhs == -2) {
+            /* one side is 0, the other is -2 */
+            delta = -3;
+        } else if ((lhs == 0 || rhs == 0) && lhs + rhs == +2) {
+            /* one side is 0, the other is +2 */
+            delta = +3;
+        } else {
+            delta = lhs - rhs;
+        }
+        cvecr->vec[idx] = delta;
+        if (idx == (size_t) 0U) {
+            break;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Euclidean length of `cvec`: the square root of the sum of the squared
+ * components. An empty vector has length 0.0. If the running sum would
+ * exceed ULONG_MAX, puzzle_err_bug() is called. `context` is unused.
+ */
+double puzzle_vector_euclidean_length(PuzzleContext * const context,
+                                      const PuzzleCvec * const cvec)
+{
+    unsigned long sum_sq = 0U;
+    unsigned long square;
+    int component;
+    size_t idx;
+
+    (void) context;
+    idx = cvec->sizeof_vec;
+    if (idx <= (size_t) 0U) {
+        return 0.0;
+    }
+    /* Walk from the last component down to index 0. */
+    while (idx > (size_t) 0U) {
+        idx--;
+        component = (int) cvec->vec[idx];
+        square = (unsigned long) (component * component);
+        if (square > ULONG_MAX - sum_sq) {
+            puzzle_err_bug(__FILE__, __LINE__);
+        }
+        sum_sq += square;
+    }
+
+    return sqrt((double) sum_sq);
+}
+
+/*
+ * Normalized distance between two cvecs:
+ *
+ *     |cvec1 - cvec2| / (|cvec1| + |cvec2|)
+ *
+ * where |.| is puzzle_vector_euclidean_length() and the difference is
+ * computed by puzzle_vector_sub() (fix_for_texts is forwarded to it).
+ *
+ * Returns 0.0 when both vectors have zero length.
+ *
+ * If the temporary difference vector cannot be allocated, 1.0 is returned.
+ * That is the largest value the plain (fix_for_texts == 0) metric can
+ * produce, so callers comparing against a similarity threshold treat the
+ * pair as "not similar" instead of dereferencing a NULL buffer.
+ */
+double puzzle_vector_normalized_distance(PuzzleContext * const context,
+                                         const PuzzleCvec * const cvec1,
+                                         const PuzzleCvec * const cvec2,
+                                         const int fix_for_texts)
+{
+    PuzzleCvec cvecr;
+    double dt, dr;
+
+    puzzle_init_cvec(context, &cvecr);
+    if (puzzle_vector_sub(context, &cvecr, cvec1, cvec2, fix_for_texts) != 0) {
+        /* puzzle_vector_sub() has already set cvecr.sizeof_vec while
+         * cvecr.vec is still NULL, so the length must not be computed.
+         * NOTE(review): assumes puzzle_free_cvec() accepts a cvec whose
+         * buffer is NULL -- confirm against its implementation. */
+        puzzle_free_cvec(context, &cvecr);
+        return 1.0;
+    }
+    dt = puzzle_vector_euclidean_length(context, &cvecr);
+    puzzle_free_cvec(context, &cvecr);
+    dr = puzzle_vector_euclidean_length(context, cvec1)
+        + puzzle_vector_euclidean_length(context, cvec2);
+    if (dr == 0.0) {
+        return 0.0;
+    }
+    return dt / dr;
+}
diff --git a/deduper/thread_pool.h b/deduper/thread_pool.h
new file mode 100644
index 0000000..ee661ce
--- /dev/null
+++ b/deduper/thread_pool.h
@@ -0,0 +1,127 @@
+#ifndef THREAD_POOL_H
+#define THREAD_POOL_H
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <utility>
+#include <vector>
+
+// Minimal mutex-protected FIFO. Every operation takes the internal mutex,
+// so individual calls are safe from several threads; sequences of calls
+// (e.g. size() followed by pop()) are not atomic as a whole.
+template<typename T>
+class _atomic_queue
+{
+public:
+    // Appends a copy of v.
+    void push(const T&v)
+    {
+        std::lock_guard<std::mutex> lck(mtx);
+        q.push(v);
+    }
+    // Appends v by moving from it (accepts temporaries).
+    void push(T&&v)
+    {
+        std::lock_guard<std::mutex> lck(mtx);
+        q.push(std::move(v));
+    }
+    // Moves the oldest element into v and returns true, or returns false
+    // and leaves v untouched when the queue is empty.
+    bool pop(T&v)
+    {
+        std::lock_guard<std::mutex> lck(mtx);
+        if(q.empty())return false;
+        v=std::move(q.front());
+        q.pop();
+        return true;
+    }
+    // Number of queued elements at the time of the call.
+    std::size_t size() const
+    {
+        std::lock_guard<std::mutex> lck(mtx);
+        return q.size();
+    }
+private:
+    std::queue<T> q;
+    mutable std::mutex mtx; // mutable so that size() can be const
+};
+
+// Fixed-size pool of worker threads fed from a shared task queue.
+//
+// Tasks are callables whose first parameter is the index of the worker
+// running them (0 .. njobs-1). create_task() returns a std::future for the
+// task's result. wait() lets the workers finish everything that is queued
+// and joins them; terminate() discards queued tasks, lets each worker finish
+// the task it is running and joins them. After either call the pool has no
+// workers left. The destructor calls wait(), so a pool can safely go out of
+// scope without an explicit wait()/terminate().
+class thread_pool
+{
+public:
+    // Starts njobs worker threads. If a thread cannot be started, the ones
+    // already running are stopped before the exception propagates.
+    thread_pool(size_t njobs):wait_interrupt(false),stop(false),waiting_threads(0)
+    {
+        thr.resize(njobs);
+        thstop.resize(njobs);
+        try
+        {
+            for(size_t i=0;i<njobs;++i)
+            {
+                auto cstop=thstop[i]=std::make_shared<std::atomic<bool>>(false);
+                auto looper=[this,i,cstop]{
+                    std::atomic<bool>&my_stop=*cstop;
+                    const int id=static_cast<int>(i);
+                    std::function<void(int)> *f=nullptr;
+                    bool popped=wq.pop(f);
+                    while(true)
+                    {
+                        // Run tasks for as long as the queue hands them out.
+                        for(;popped;popped=wq.pop(f))
+                        {
+                            std::unique_ptr<std::function<void(int)>> pf(f);
+                            (*pf)(id);
+                            if(my_stop)return;
+                        }
+                        // Queue is empty: sleep until a task arrives or the
+                        // pool is being waited on / terminated.
+                        std::unique_lock<std::mutex> lck(mtx);
+                        ++waiting_threads;
+                        cv.wait(lck,[this,&f,&popped,&my_stop]{
+                            popped=wq.pop(f);
+                            return popped||wait_interrupt||my_stop;
+                        });
+                        --waiting_threads;
+                        if(!popped)return;
+                    }
+                };
+                thr[i].reset(new std::thread(looper));
+            }
+        }
+        catch(...)
+        {
+            terminate();
+            throw;
+        }
+    }
+    // Finishes the queued work and joins the workers (no-op when wait() or
+    // terminate() has already been called).
+    ~thread_pool()
+    {
+        wait();
+    }
+    thread_pool(const thread_pool&)=delete;
+    thread_pool& operator=(const thread_pool&)=delete;
+
+    // Queues f(worker_id,args...) and returns a future for its result.
+    template<typename F,typename...A>
+    auto create_task(F&&f,A&&...args)->std::future<decltype(f(0,args...))>
+    {
+        using result_t=decltype(f(0,args...));
+        auto task=std::make_shared<std::packaged_task<result_t(int)>>(
+            std::bind(std::forward<F>(f),std::placeholders::_1,std::forward<A>(args)...)
+        );
+        // Take the future before a worker can see (and run) the task.
+        std::future<result_t> fut=task->get_future();
+        std::unique_ptr<std::function<void(int)>> owned(
+            new std::function<void(int)>([task](int id){(*task)(id);}));
+        std::function<void(int)> *worktask=owned.get();
+        wq.push(worktask);
+        owned.release(); // the queue owns the pointer from here on
+        {
+            std::unique_lock<std::mutex> lck(mtx);
+            cv.notify_one();
+        }
+        return fut;
+    }
+    // Lets the workers drain the queue, then joins them.
+    void wait()
+    {
+        if(!stop)wait_interrupt=true;
+        {
+            std::unique_lock<std::mutex> lck(mtx);
+            cv.notify_all();
+        }
+        join_all();
+        drain_queue();
+        thr.clear();thstop.clear();
+    }
+    // Drops queued tasks, asks every worker to stop after its current task
+    // and joins them.
+    void terminate()
+    {
+        stop=true;
+        drain_queue();
+        for(size_t i=0;i<thstop.size();++i)if(thstop[i])*thstop[i]=true;
+        {
+            std::unique_lock<std::mutex> lck(mtx);
+            cv.notify_all();
+        }
+        join_all();
+        drain_queue();
+        thr.clear();thstop.clear();
+    }
+private:
+    // Joins every worker that was actually started.
+    void join_all()
+    {
+        for(size_t i=0;i<thr.size();++i)if(thr[i]&&thr[i]->joinable())thr[i]->join();
+    }
+    // Deletes whatever is still queued. Only pointers that pop() really
+    // handed out are deleted, so a worker emptying the queue at the same
+    // time cannot cause a stale pointer to be freed.
+    void drain_queue()
+    {
+        std::function<void(int)> *f=nullptr;
+        while(wq.pop(f))delete f;
+    }
+    std::vector<std::unique_ptr<std::thread>> thr;
+    std::vector<std::shared_ptr<std::atomic<bool>>> thstop;
+    _atomic_queue<std::function<void(int)>*> wq;
+    std::atomic<bool> wait_interrupt;
+    std::atomic<bool> stop;
+    std::atomic<int> waiting_threads;
+    std::mutex mtx;
+    std::condition_variable cv;
+};
+
+#endif