diff options
author | Chris Xiong <chirs241097@gmail.com> | 2020-04-06 00:50:58 +0800 |
---|---|---|
committer | Chris Xiong <chirs241097@gmail.com> | 2020-04-06 00:50:58 +0800 |
commit | ed47c1557915bb2472f6959e723cd76155312a98 (patch) | |
tree | 85bc451630ebaa4f5ffce3043b4cbf948a912a66 /deduper/libpuzzle/man | |
parent | 0a094f28c2e2ebfaac91398ae62e40f00f09221b (diff) | |
download | oddities-ed47c1557915bb2472f6959e723cd76155312a98.tar.xz |
Add deduper (unfinished tool for finding image duplicates).
Diffstat (limited to 'deduper/libpuzzle/man')
-rw-r--r-- | deduper/libpuzzle/man/Makefile.am | 7 | ||||
-rw-r--r-- | deduper/libpuzzle/man/libpuzzle.3 | 296 | ||||
-rw-r--r-- | deduper/libpuzzle/man/puzzle-diff.8 | 58 | ||||
-rw-r--r-- | deduper/libpuzzle/man/puzzle_set.3 | 129 |
4 files changed, 490 insertions, 0 deletions
diff --git a/deduper/libpuzzle/man/Makefile.am b/deduper/libpuzzle/man/Makefile.am new file mode 100644 index 0000000..a3a78a5 --- /dev/null +++ b/deduper/libpuzzle/man/Makefile.am @@ -0,0 +1,7 @@ +man_MANS = \ + libpuzzle.3 \ + puzzle_set.3 \ + puzzle-diff.8 + +EXTRA_DIST = \ + $(man_MANS) diff --git a/deduper/libpuzzle/man/libpuzzle.3 b/deduper/libpuzzle/man/libpuzzle.3 new file mode 100644 index 0000000..98cfcbb --- /dev/null +++ b/deduper/libpuzzle/man/libpuzzle.3 @@ -0,0 +1,296 @@ +.\" +.\" Copyright (c) 2007-2014 Frank DENIS <j at pureftpd.org> +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd $Mdocdate: March 31 2011 $ +.Dt LIBPUZZLE 3 +.Sh NAME +.Nm puzzle_init_cvec , +.Nm puzzle_init_dvec , +.Nm puzzle_fill_dvec_from_file , +.Nm puzzle_fill_cvec_from_file , +.Nm puzzle_fill_dvec_from_mem , +.Nm puzzle_fill_cvec_from_mem , +.Nm puzzle_fill_cvec_from_dvec , +.Nm puzzle_free_cvec , +.Nm puzzle_free_dvec , +.Nm puzzle_init_compressed_cvec , +.Nm puzzle_free_compressed_cvec , +.Nm puzzle_compress_cvec , +.Nm puzzle_uncompress_cvec , +.Nm puzzle_vector_normalized_distance +.Nd compute comparable signatures of bitmap images. 
+.Sh SYNOPSIS +.Fd #include <puzzle.h> +.Ft void +.Fn puzzle_init_context "PuzzleContext *context" +.Ft void +.Fn puzzle_free_context "PuzzleContext *context" +.Ft void +.Fn puzzle_init_cvec "PuzzleContext *context" "PuzzleCvec *cvec" +.Ft void +.Fn puzzle_init_dvec "PuzzleContext *context" "PuzzleDvec *dvec" +.Ft int +.Fn puzzle_fill_dvec_from_file "PuzzleContext *context" "PuzzleDvec * dvec" "const char *file" +.Ft int +.Fn puzzle_fill_cvec_from_file "PuzzleContext *context" "PuzzleCvec * cvec" "const char *file" +.Ft int +.Fn puzzle_fill_dvec_from_mem "PuzzleContext *context" "PuzzleDvec * dvec" "const void *mem" "size_t size" +.Ft int +.Fn puzzle_fill_cvec_from_mem "PuzzleContext *context" "PuzzleCvec * cvec" "const void *mem" "size_t size" +.Ft int +.Fn puzzle_fill_cvec_from_dvec "PuzzleContext *context" "PuzzleCvec * cvec" "const PuzzleDvec *dvec" +.Ft void +.Fn puzzle_free_cvec "PuzzleContext *context" "PuzzleCvec *cvec" +.Ft void +.Fn puzzle_free_dvec "PuzzleContext *context" "PuzzleDvec *dvec" +.Ft void +.Fn puzzle_init_compressed_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" +.Ft void +.Fn puzzle_free_compressed_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" +.Ft int +.Fn puzzle_compress_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" "const PuzzleCvec * cvec" +.Ft int +.Fn puzzle_uncompress_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" "PuzzleCvec * const cvec" +.Ft double +.Fn puzzle_vector_normalized_distance "PuzzleContext *context" "const PuzzleCvec * cvec1" "const PuzzleCvec * cvec2" "int fix_for_texts" +.Sh DESCRIPTION +The Puzzle library computes a signature out of a bitmap picture. +Signatures are comparable and similar pictures have similar signatures. 
+.Pp +After a picture has been loaded and uncompressed, featureless parts of +the image are skipped (autocrop), unless that step has been explicitly +disabled, see +.Xr puzzle_set 3 +.Sh LIBPUZZLE CONTEXT +Every public function requires a +.Va PuzzleContext +object, which stores all the required tunables. +.Pp +Any application using libpuzzle should initialize a +.Va PuzzleContext +object with +.Fn puzzle_init_context +and free it after use with +.Fn puzzle_free_context +.Bd \-literal \-offset indent +PuzzleContext context; + +puzzle_init_context(&context); + ... +puzzle_free_context(&context); +.Ed +.Sh DVEC AND CVEC VECTORS +The next step is to divide the cropped image into a grid and to compute +the average intensity of soft\(hyedged pixels in every block. The result is a +.Va PuzzleDvec +object. +.Pp +.Va PuzzleDvec +objects should be initialized before use, with +.Fn puzzle_init_dvec +and freed after use with +.Fn puzzle_free_dvec +.Pp +The +.Va PuzzleDvec +structure has two important fields: +.Va vec +is the pointer to the first element of the array containing the average +intensities, and +.Va sizeof_compressed_vec +is the number of elements. +.Pp +.Va PuzzleDvec +objects are not comparable, so what you usually want is to transform these +objects into +.Va PuzzleCvec +objects. +.Pp +A +.Va PuzzleCvec +object is a vector with relationships between adjacent blocks from a +.Va PuzzleDvec +object. +.Pp +The +.Fn puzzle_fill_cvec_from_dvec +fills a +.Va PuzzleCvec +object from a +.Va PuzzleDvec +object. 
+.Pp +But just like the other structure, +.Va PuzzleCvec +objects must be initialized and freed with +.Fn puzzle_init_cvec +and +.Fn puzzle_free_cvec +.Pp +.Va PuzzleCvec +objects have a vector whose first element is in the +.Va vec +field, and the number of elements is in the +.Va sizeof_vec +field. +.Sh LOADING PICTURES +.Va PuzzleDvec +and +.Va PuzzleCvec +objects can be computed from a bitmap picture file, with +.Fn puzzle_fill_dvec_from_file +and +.Fn puzzle_fill_cvec_from_file +.Pp +.Em GIF +, +.Em PNG +and +.Em JPEG +file formats are currently supported and automatically recognized. +.Pp +Here's a simple example that creates a +.Va PuzzleCvec +object out of a file. +.Bd \-literal \-offset indent +PuzzleContext context; +PuzzleCvec cvec; + +puzzle_init_context(&context); +puzzle_init_cvec(&context, &cvec); +puzzle_fill_cvec_from_file(&context, &cvec, "test\-picture.jpg"); + ... +puzzle_free_cvec(&context, &cvec); +puzzle_free_context(&context); +.Ed +.Sh COMPARING VECTORS +In order to check whether two pictures are similar, you need to compare their +.Va PuzzleCvec +signatures, using +.Fn puzzle_vector_normalized_distance +.Pp +That function returns a distance between 0.0 and 1.0. The lower, the more similar. +.Pp +Tests on common pictures show that a normalized distance of 0.6 (also defined as +.Va PUZZLE_CVEC_SIMILARITY_THRESHOLD +) means that both pictures are visually similar. +.Pp +If that threshold is not right for your set of pictures, you can experiment +with +.Va PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD +, +.Va PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD +and +.Va PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD +or with your own value. +.Pp +If the +.Fa fix_for_texts +argument of +.Fn puzzle_vector_normalized_distance +is +.Em 1 +, a fix is applied to the computation in order to deal with bitmap pictures +that contain text. That fix is recommended, as it allows using the same +threshold for that kind of picture as for generic pictures. 
+.Pp +If +.Fa fix_for_texts +is +.Em 0 +, that special way of computing the normalized distance is disabled. +.Bd \-literal \-offset indent +PuzzleContext context; +PuzzleCvec cvec1, cvec2; +double d; + +puzzle_init_context(&context); +puzzle_init_cvec(&context, &cvec1); +puzzle_init_cvec(&context, &cvec2); +puzzle_fill_cvec_from_file(&context, &cvec1, "test\-picture\-1.jpg"); +puzzle_fill_cvec_from_file(&context, &cvec2, "test\-picture\-2.jpg"); +d = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, 1); +if (d < PUZZLE_CVEC_SIMILARITY_THRESHOLD) { + puts("Pictures are similar"); +} +puzzle_free_cvec(&context, &cvec2); +puzzle_free_cvec(&context, &cvec1); +puzzle_free_context(&context); +.Ed +.Sh CVEC COMPRESSION +In order to reduce storage needs, +.Va PuzzleCvec +objects can be compressed to 1/3 of their original size. +.Pp +.Va PuzzleCompressedCvec +structures hold the compressed data. Before and after use, these structures +have to be passed to +.Fn puzzle_init_compressed_cvec +and +.Fn puzzle_free_compressed_cvec +.Pp +.Fn puzzle_compress_cvec +compresses a +.Va PuzzleCvec +object into a +.Va PuzzleCompressedCvec +object. +.Pp +And +.Fn puzzle_uncompress_cvec +uncompresses a +.Va PuzzleCompressedCvec +object into a +.Va PuzzleCvec +object. +.Bd \-literal \-offset indent +PuzzleContext context; +PuzzleCvec cvec; +PuzzleCompressedCvec c_cvec; + ... +puzzle_init_compressed_cvec(&context, &c_cvec); +puzzle_compress_cvec(&context, &c_cvec, &cvec); + ... +puzzle_free_compressed_cvec(&context, &c_cvec); +.Ed +The +.Va PuzzleCompressedCvec +structure has two important fields: +.Va vec +that is a pointer to the first element of the compressed data, and +.Va sizeof_compressed_vec +that contains the number of elements. +.Sh RETURN VALUE +Functions return +.Em 0 +on success, and +.Em \-1 +if something went wrong. +.Sh AUTHORS +.Nf +Frank DENIS +libpuzzle at pureftpd dot org +.Fi +.Sh ACKNOWLEDGMENTS +.Nf +Xerox Research Center +H. 
CHI WONG +Marschall BERN +David GOLDBERG +Sameh SCHAFIK +.Fi +.Sh SEE ALSO +.Xr puzzle_set 3 +.Xr puzzle\-diff 8 diff --git a/deduper/libpuzzle/man/puzzle-diff.8 b/deduper/libpuzzle/man/puzzle-diff.8 new file mode 100644 index 0000000..5744b5a --- /dev/null +++ b/deduper/libpuzzle/man/puzzle-diff.8 @@ -0,0 +1,58 @@ +.\" +.\" Copyright (c) 2007-2014 Frank DENIS <j at pureftpd.org> +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd $Mdocdate: September 23 2007 $ +.Dt PUZZLE-DIFF 8 +.Os +.Sh NAME +.Nm puzzle\-diff +.Nd compare pictures with libpuzzle +.Sh SYNOPSIS +.Nm puzzle\-diff +[\-b <contrast barrier for cropping>] [\-c] [\-C <max cropping ratio>] +[\-e] [\-E <similarity threshold>] [\-h] [\-H <max height>] [\-l <lambdas>] +[\-n <noise cutoff>] [\-p <p ratio>] [\-t] [\-W <max width>] +<file 1> +<file 2> +.Sh DESCRIPTION +puzzle\-diff compares two pictures and outputs the normalized distance. +.Pp +Try +.Em puzzle\-diff \-h +for more info. 
+.Sh EXAMPLES +Output distance between two images: +.Bd -literal -offset indent +$ puzzle\-diff pic\-a\-0.jpg pic\-a\-1.jpg +0.102286 +.Ed +.Pp +Compare two images, exit with 10 if they look the same, exit with 20 if +they don't (may be useful for scripts): +.Bd -literal -offset indent +$ puzzle\-diff \-e pic\-a\-0.jpg pic\-a\-1.jpg +$ echo $? +10 +.Ed +.Pp +Compute the distance without cropping, averaging the intensity +over the whole blocks: +.Bd -literal -offset indent +$ puzzle\-diff \-p 1.0 \-c pic\-a\-0.jpg pic\-a\-1.jpg +0.0523151 +.Ed +.Sh SEE ALSO +.Xr libpuzzle 3 +.Xr puzzle_set 3 diff --git a/deduper/libpuzzle/man/puzzle_set.3 b/deduper/libpuzzle/man/puzzle_set.3 new file mode 100644 index 0000000..a8d017b --- /dev/null +++ b/deduper/libpuzzle/man/puzzle_set.3 @@ -0,0 +1,129 @@ +.\" +.\" Copyright (c) 2007-2014 Frank DENIS <j at pureftpd.org> +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd $Mdocdate: September 24 2007 $ +.Dt PUZZLE_SET 3 +.Sh NAME +.Nm puzzle_set_max_width , +.Nm puzzle_set_max_height , +.Nm puzzle_set_lambdas , +.Nm puzzle_set_p_ratio , +.Nm puzzle_set_noise_cutoff , +.Nm puzzle_set_contrast_barrier_for_cropping , +.Nm puzzle_set_max_cropping_ratio , +.Nm puzzle_set_autocrop +.Nd set tunables for libpuzzle functions. 
+.Sh SYNOPSIS +.Fd #include <puzzle.h> +.Ft int +.Fn puzzle_set_max_width "PuzzleContext *context" "unsigned int width" +.Ft int +.Fn puzzle_set_max_height "PuzzleContext *context" "unsigned int height" +.Ft int +.Fn puzzle_set_lambdas "PuzzleContext *context" "unsigned int lambdas" +.Ft int +.Fn puzzle_set_p_ratio "PuzzleContext *context" "double p_ratio" +.Ft int +.Fn puzzle_set_noise_cutoff "PuzzleContext *context" "double noise_cutoff" +.Ft int +.Fn puzzle_set_contrast_barrier_for_cropping "PuzzleContext *context" "double barrier" +.Ft int +.Fn puzzle_set_max_cropping_ratio "PuzzleContext *context" "double ratio" +.Ft int +.Fn puzzle_set_autocrop "PuzzleContext *context" "int enable" +.Sh DESCRIPTION +While default values have been chosen to be ok for most people, the +.Fn puzzle_set_* +functions are knobs to fit the algorithm to your set of data and to your +applications. +.Sh LAMBDAS +By default, pictures are divided in 9 x 9 blocks. +.Pp +.Em 9 +is the +.Em lambdas +value, and it can be changed with +.Fn puzzle_set_lambdas +.Pp +For large databases, for complex images, for images with a lot of text or +for sets of near\(hysimilar images, it might be better to raise that value to +.Em 11 +or even +.Em 13 +.Pp +However, raising that value obviously means that vectors will require more +storage space. +.Pp +The +.Em lambdas +value should remain the same in order to get comparable vectors. So if you +pick +.Em 11 +(for instance), you should always use that value for all pictures you will +compute a digest for. +.Fn puzzle_set_p_ratio +.Pp +The average intensity of each block is based upon a small centered zone. +.Pp +The "p ratio" determines the size of that zone. The default is 2.0, and that +ratio mimics the behavior that is described in the reference algorithm. +.Pp +For very specific cases (complex images) or if you get too many false +positives, as an alternative to increasing lambdas, you can try to lower that +value, for instance to 1.5. 
+.Pp +The lowest acceptable value is 1.0. +.Sh MAXIMUM SIZES +In order to avoid CPU starvation, pictures won't be processed if their width +or height is larger than 3000 pixels. +.Pp +These limits are rather large, but if you ever need to change them, the +.Fn puzzle_set_max_width +and +.Fn puzzle_set_max_height +functions are available. +.Sh NOISE CUTOFF +The noise cutoff defaults to 2. If you raise that value, more zones with +little difference in intensity will be considered similar. +.Pp +Unless you have very specialized sets of pictures, you probably don't want +to change this. +.Sh AUTOCROP +By default, featureless borders of the original image are ignored. The size +of each border depends on the sum of absolute values of differences between +adjacent pixels, relative to the total sum. +.Pp +That feature can be disabled with +.Fn puzzle_set_autocrop "0" +Any other value will enable it. +.Pp +.Fn puzzle_set_contrast_barrier_for_cropping +changes the tolerance. The default value is 5. A lower value shaves less, a higher value shaves +more. +.Pp +.Fn puzzle_set_max_cropping_ratio +is a safeguard against unwanted excessive auto\(hycropping. +.Pp +The default (0.25) means that no more than 25% of the total width (or +height) will ever be shaved. +.Sh RETURN VALUE +Functions return +.Em 0 +on success, and +.Em \-1 +if something went wrong. +.Sh SEE ALSO +.Xr libpuzzle 3 +.Xr puzzle\-diff 8 |