aboutsummaryrefslogtreecommitdiff
path: root/deduper/libpuzzle/man
diff options
context:
space:
mode:
Diffstat (limited to 'deduper/libpuzzle/man')
-rw-r--r--deduper/libpuzzle/man/Makefile.am7
-rw-r--r--deduper/libpuzzle/man/libpuzzle.3296
-rw-r--r--deduper/libpuzzle/man/puzzle-diff.858
-rw-r--r--deduper/libpuzzle/man/puzzle_set.3129
4 files changed, 490 insertions, 0 deletions
diff --git a/deduper/libpuzzle/man/Makefile.am b/deduper/libpuzzle/man/Makefile.am
new file mode 100644
index 0000000..a3a78a5
--- /dev/null
+++ b/deduper/libpuzzle/man/Makefile.am
@@ -0,0 +1,7 @@
+man_MANS = \
+ libpuzzle.3 \
+ puzzle_set.3 \
+ puzzle-diff.8
+
+EXTRA_DIST = \
+ $(man_MANS)
diff --git a/deduper/libpuzzle/man/libpuzzle.3 b/deduper/libpuzzle/man/libpuzzle.3
new file mode 100644
index 0000000..98cfcbb
--- /dev/null
+++ b/deduper/libpuzzle/man/libpuzzle.3
@@ -0,0 +1,296 @@
+.\"
+.\" Copyright (c) 2007-2014 Frank DENIS <j at pureftpd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: March 31 2011 $
+.Dt LIBPUZZLE 3
+.Sh NAME
+.Nm puzzle_init_cvec ,
+.Nm puzzle_init_dvec ,
+.Nm puzzle_fill_dvec_from_file ,
+.Nm puzzle_fill_cvec_from_file ,
+.Nm puzzle_fill_dvec_from_mem ,
+.Nm puzzle_fill_cvec_from_mem ,
+.Nm puzzle_fill_cvec_from_dvec ,
+.Nm puzzle_free_cvec ,
+.Nm puzzle_free_dvec ,
+.Nm puzzle_init_compressed_cvec ,
+.Nm puzzle_free_compressed_cvec ,
+.Nm puzzle_compress_cvec ,
+.Nm puzzle_uncompress_cvec ,
+.Nm puzzle_vector_normalized_distance
+.Nd compute comparable signatures of bitmap images.
+.Sh SYNOPSIS
+.Fd #include <puzzle.h>
+.Ft void
+.Fn puzzle_init_context "PuzzleContext *context"
+.Ft void
+.Fn puzzle_free_context "PuzzleContext *context"
+.Ft void
+.Fn puzzle_init_cvec "PuzzleContext *context" "PuzzleCvec *cvec"
+.Ft void
+.Fn puzzle_init_dvec "PuzzleContext *context" "PuzzleDvec *dvec"
+.Ft int
+.Fn puzzle_fill_dvec_from_file "PuzzleContext *context" "PuzzleDvec * dvec" "const char *file"
+.Ft int
+.Fn puzzle_fill_cvec_from_file "PuzzleContext *context" "PuzzleCvec * cvec" "const char *file"
+.Ft int
+.Fn puzzle_fill_dvec_from_mem "PuzzleContext *context" "PuzzleDvec * dvec" "const void *mem" "size_t size"
+.Ft int
+.Fn puzzle_fill_cvec_from_mem "PuzzleContext *context" "PuzzleCvec * cvec" "const void *mem" "size_t size"
+.Ft int
+.Fn puzzle_fill_cvec_from_dvec "PuzzleContext *context" "PuzzleCvec * cvec" "const PuzzleDvec *dvec"
+.Ft void
+.Fn puzzle_free_cvec "PuzzleContext *context" "PuzzleCvec *cvec"
+.Ft void
+.Fn puzzle_free_dvec "PuzzleContext *context" "PuzzleDvec *dvec"
+.Ft void
+.Fn puzzle_init_compressed_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec"
+.Ft void
+.Fn puzzle_free_compressed_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec"
+.Ft int
+.Fn puzzle_compress_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" "const PuzzleCvec * cvec"
+.Ft int
+.Fn puzzle_uncompress_cvec "PuzzleContext *context" "PuzzleCompressedCvec * compressed_cvec" "PuzzleCvec * const cvec"
+.Ft double
+.Fn puzzle_vector_normalized_distance "PuzzleContext *context" "const PuzzleCvec * cvec1" "const PuzzleCvec * cvec2" "int fix_for_texts"
+.Sh DESCRIPTION
+The Puzzle library computes a signature out of a bitmap picture.
+Signatures are comparable and similar pictures have similar signatures.
+.Pp
+After a picture has been loaded and uncompressed, featureless parts of
+the image are skipped (autocrop), unless that step has been explicitly
+disabled, see
+.Xr puzzle_set 3 .
+.Sh LIBPUZZLE CONTEXT
+Every public function requires a
+.Va PuzzleContext
+object that stores all required tunables.
+.Pp
+Any application using libpuzzle should initialize a
+.Va PuzzleContext
+object with
+.Fn puzzle_init_context
+and free it after use with
+.Fn puzzle_free_context .
+.Bd \-literal \-offset indent
+PuzzleContext context;
+
+puzzle_init_context(&context);
+ ...
+puzzle_free_context(&context);
+.Ed
+.Sh DVEC AND CVEC VECTORS
+The next step is to divide the cropped image into a grid and to compute
+the average intensity of soft\(hyedged pixels in every block. The result is a
+.Va PuzzleDvec
+object.
+.Pp
+.Va PuzzleDvec
+objects should be initialized before use, with
+.Fn puzzle_init_dvec
+and freed after use with
+.Fn puzzle_free_dvec .
+.Pp
+The
+.Va PuzzleDvec
+structure has two important fields:
+.Va vec
+is the pointer to the first element of the array containing the average
+intensities, and
+.Va sizeof_compressed_vec
+is the number of elements.
+.Pp
+.Va PuzzleDvec
+objects are not comparable, so what you usually want is to transform these
+objects into
+.Va PuzzleCvec
+objects.
+.Pp
+A
+.Va PuzzleCvec
+object is a vector with relationships between adjacent blocks from a
+.Va PuzzleDvec
+object.
+.Pp
+The
+.Fn puzzle_fill_cvec_from_dvec
+function fills a
+.Va PuzzleCvec
+object from a
+.Va PuzzleDvec
+object.
+.Pp
+But just like the other structure,
+.Va PuzzleCvec
+objects must be initialized and freed with
+.Fn puzzle_init_cvec
+and
+.Fn puzzle_free_cvec .
+.Pp
+.Va PuzzleCvec
+objects have a vector whose first element is in the
+.Va vec
+field, and the number of elements is in the
+.Va sizeof_vec
+field.
+.Sh LOADING PICTURES
+.Va PuzzleDvec
+and
+.Va PuzzleCvec
+objects can be computed from a bitmap picture file, with
+.Fn puzzle_fill_dvec_from_file
+and
+.Fn puzzle_fill_cvec_from_file .
+.Pp
+.Em GIF
+,
+.Em PNG
+and
+.Em JPEG
+file formats are currently supported and automatically recognized.
+.Pp
+Here's a simple example that creates a
+.Va PuzzleCvec
+object out of a file.
+.Bd \-literal \-offset indent
+PuzzleContext context;
+PuzzleCvec cvec;
+
+puzzle_init_context(&context);
+puzzle_init_cvec(&context, &cvec);
+puzzle_fill_cvec_from_file(&context, &cvec, "test\-picture.jpg");
+ ...
+puzzle_free_cvec(&context, &cvec);
+puzzle_free_context(&context);
+.Ed
+.Sh COMPARING VECTORS
+In order to check whether two pictures are similar, you need to compare their
+.Va PuzzleCvec
+signatures, using
+.Fn puzzle_vector_normalized_distance .
+.Pp
+That function returns a distance between 0.0 and 1.0. The smaller the distance, the more similar the pictures.
+.Pp
+Tests on common pictures show that a normalized distance below 0.6 (also defined as
+.Va PUZZLE_CVEC_SIMILARITY_THRESHOLD
+) means that both pictures are visually similar.
+.Pp
+If that threshold is not right for your set of pictures, you can experiment
+with
+.Va PUZZLE_CVEC_SIMILARITY_HIGH_THRESHOLD
+,
+.Va PUZZLE_CVEC_SIMILARITY_LOW_THRESHOLD
+and
+.Va PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD
+or with your own value.
+.Pp
+If the
+.Fa fix_for_texts
+argument of
+.Fn puzzle_vector_normalized_distance
+is
+.Em 1
+, a fix is applied to the computation in order to deal with bitmap pictures
+that contain text. That fix is recommended, as it allows using the same
+threshold for that kind of picture as for generic pictures.
+.Pp
+If
+.Fa fix_for_texts
+is
+.Em 0
+, that special way of computing the normalized distance is disabled.
+.Bd \-literal \-offset indent
+PuzzleContext context;
+PuzzleCvec cvec1, cvec2;
+double d;
+
+puzzle_init_context(&context);
+puzzle_init_cvec(&context, &cvec1);
+puzzle_init_cvec(&context, &cvec2);
+puzzle_fill_cvec_from_file(&context, &cvec1, "test\-picture\-1.jpg");
+puzzle_fill_cvec_from_file(&context, &cvec2, "test\-picture\-2.jpg");
+d = puzzle_vector_normalized_distance(&context, &cvec1, &cvec2, 1);
+if (d < PUZZLE_CVEC_SIMILARITY_THRESHOLD) {
+ puts("Pictures are similar");
+}
+puzzle_free_cvec(&context, &cvec2);
+puzzle_free_cvec(&context, &cvec1);
+puzzle_free_context(&context);
+.Ed
+.Sh CVEC COMPRESSION
+In order to reduce storage needs,
+.Va PuzzleCvec
+objects can be compressed to 1/3 of their original size.
+.Pp
+.Va PuzzleCompressedCvec
+structures hold the compressed data. Before and after use, these structures
+have to be passed to
+.Fn puzzle_init_compressed_cvec
+and
+.Fn puzzle_free_compressed_cvec .
+.Pp
+.Fn puzzle_compress_cvec
+compresses a
+.Va PuzzleCvec
+object into a
+.Va PuzzleCompressedCvec
+object.
+.Pp
+And
+.Fn puzzle_uncompress_cvec
+uncompresses a
+.Va PuzzleCompressedCvec
+object into a
+.Va PuzzleCvec
+object.
+.Bd \-literal \-offset indent
+PuzzleContext context;
+PuzzleCvec cvec;
+PuzzleCompressedCvec c_cvec;
+ ...
+puzzle_init_compressed_cvec(&context, &c_cvec);
+puzzle_compress_cvec(&context, &c_cvec, &cvec);
+ ...
+puzzle_free_compressed_cvec(&context, &c_cvec);
+.Ed
+The
+.Va PuzzleCompressedCvec
+structure has two important fields:
+.Va vec
+that is a pointer to the first element of the compressed data, and
+.Va sizeof_compressed_vec
+that contains the number of elements.
+.Sh RETURN VALUE
+Functions with an int return type return
+.Em 0
+on success, and
+.Em \-1
+if something went wrong.
+.Sh AUTHORS
+.Nf
+Frank DENIS
+libpuzzle at pureftpd dot org
+.Fi
+.Sh ACKNOWLEDGMENTS
+.Nf
+Xerox Research Center
+H. CHI WONG
+Marschall BERN
+David GOLDBERG
+Sameh SCHAFIK
+.Fi
+.Sh SEE ALSO
+.Xr puzzle_set 3
+.Xr puzzle\-diff 8
diff --git a/deduper/libpuzzle/man/puzzle-diff.8 b/deduper/libpuzzle/man/puzzle-diff.8
new file mode 100644
index 0000000..5744b5a
--- /dev/null
+++ b/deduper/libpuzzle/man/puzzle-diff.8
@@ -0,0 +1,58 @@
+.\"
+.\" Copyright (c) 2007-2014 Frank DENIS <j at pureftpd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: September 23 2007 $
+.Dt PUZZLE-DIFF 8
+.Os
+.Sh NAME
+.Nm puzzle\-diff
+.Nd compare pictures with libpuzzle
+.Sh SYNOPSIS
+.Nm puzzle\-diff
+[\-b <contrast barrier for cropping>] [\-c] [\-C <max cropping ratio>]
+[\-e] [\-E <similarity threshold>] [\-h] [\-H <max height>] [\-l <lambdas>]
+[\-n <noise cutoff>] [\-p <p ratio>] [\-t] [\-W <max width>]
+<file 1>
+<file 2>
+.Sh DESCRIPTION
+puzzle\-diff compares two pictures and outputs the normalized distance.
+.Pp
+Try
+.Em puzzle\-diff \-h
+for more info.
+.Sh EXAMPLES
+Output distance between two images:
+.Bd -literal -offset indent
+$ puzzle\-diff pic\-a\-0.jpg pic\-a\-1.jpg
+0.102286
+.Ed
+.Pp
+Compare two images, exit with 10 if they look the same, exit with 20 if
+they don't (may be useful for scripts):
+.Bd -literal -offset indent
+$ puzzle\-diff \-e pic\-a\-0.jpg pic\-a\-1.jpg
+$ echo $?
+10
+.Ed
+.Pp
+Compute the distance without cropping, using the average intensity of
+whole blocks:
+.Bd -literal -offset indent
+$ puzzle\-diff \-p 1.0 \-c pic\-a\-0.jpg pic\-a\-1.jpg
+0.0523151
+.Ed
+.Sh SEE ALSO
+.Xr libpuzzle 3
+.Xr puzzle_set 3
diff --git a/deduper/libpuzzle/man/puzzle_set.3 b/deduper/libpuzzle/man/puzzle_set.3
new file mode 100644
index 0000000..a8d017b
--- /dev/null
+++ b/deduper/libpuzzle/man/puzzle_set.3
@@ -0,0 +1,129 @@
+.\"
+.\" Copyright (c) 2007-2014 Frank DENIS <j at pureftpd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: September 24 2007 $
+.Dt PUZZLE_SET 3
+.Sh NAME
+.Nm puzzle_set_max_width ,
+.Nm puzzle_set_max_height ,
+.Nm puzzle_set_lambdas ,
+.Nm puzzle_set_p_ratio ,
+.Nm puzzle_set_noise_cutoff ,
+.Nm puzzle_set_contrast_barrier_for_cropping ,
+.Nm puzzle_set_max_cropping_ratio ,
+.Nm puzzle_set_autocrop
+.Nd set tunables for libpuzzle functions.
+.Sh SYNOPSIS
+.Fd #include <puzzle.h>
+.Ft int
+.Fn puzzle_set_max_width "PuzzleContext *context" "unsigned int width"
+.Ft int
+.Fn puzzle_set_max_height "PuzzleContext *context" "unsigned int height"
+.Ft int
+.Fn puzzle_set_lambdas "PuzzleContext *context" "unsigned int lambdas"
+.Ft int
+.Fn puzzle_set_p_ratio "PuzzleContext *context" "double p_ratio"
+.Ft int
+.Fn puzzle_set_noise_cutoff "PuzzleContext *context" "double noise_cutoff"
+.Ft int
+.Fn puzzle_set_contrast_barrier_for_cropping "PuzzleContext *context" "double barrier"
+.Ft int
+.Fn puzzle_set_max_cropping_ratio "PuzzleContext *context" "double ratio"
+.Ft int
+.Fn puzzle_set_autocrop "PuzzleContext *context" "int enable"
+.Sh DESCRIPTION
+While default values have been chosen to be ok for most people, the
+.Fn puzzle_set_*
+functions are knobs to fit the algorithm to your set of data and to your
+applications.
+.Sh LAMBDAS
+By default, pictures are divided into 9 x 9 blocks.
+.Pp
+.Em 9
+is the
+.Em lambdas
+value, and it can be changed with
+.Fn puzzle_set_lambdas .
+.Pp
+For large databases, for complex images, for images with a lot of text or
+for sets of near\(hysimilar images, it might be better to raise that value to
+.Em 11
+or even
+.Em 13 .
+.Pp
+However, raising that value obviously means that vectors will require more
+storage space.
+.Pp
+The
+.Em lambdas
+value should remain the same in order to get comparable vectors. So if you
+pick
+.Em 11
+(for instance), you should always use that value for all pictures you will
+compute a digest for.
+.Sh P RATIO
+The average intensity of each block is based upon a small centered zone.
+.Pp
+The "p ratio", set with
+.Fn puzzle_set_p_ratio ,
+determines the size of that zone. The default is 2.0, and that ratio mimics the behavior that is described in the reference algorithm.
+.Pp
+For very specific cases (complex images) or if you get too many false
+positives, as an alternative to increasing lambdas, you can try to lower that
+value, for instance to 1.5.
+.Pp
+The lowest acceptable value is 1.0.
+.Sh MAXIMUM SIZES
+In order to avoid CPU starvation, pictures won't be processed if their width
+or height is larger than 3000 pixels.
+.Pp
+These limits are rather large, but if you ever need to change them, the
+.Fn puzzle_set_max_width
+and
+.Fn puzzle_set_max_height
+functions are available.
+.Sh NOISE CUTOFF
+The noise cutoff defaults to 2. If you raise that value, more zones with
+little difference of intensity will be considered as similar.
+.Pp
+Unless you have very specialized sets of pictures, you probably don't want
+to change this.
+.Sh AUTOCROP
+By default, featureless borders of the original image are ignored. The size
+of each border depends on the sum of absolute values of differences between
+adjacent pixels, relative to the total sum.
+.Pp
+That feature can be disabled with
+.Fn puzzle_set_autocrop "0" .
+Any other value will enable it.
+.Pp
+.Fn puzzle_set_contrast_barrier_for_cropping
+changes the tolerance. The default value is 5. Less shaves less, more shaves
+more.
+.Pp
+.Fn puzzle_set_max_cropping_ratio
+sets a safeguard against unwanted excessive auto\(hycropping.
+.Pp
+The default (0.25) means that no more than 25% of the total width (or
+height) will ever be shaved.
+.Sh RETURN VALUE
+Functions return
+.Em 0
+on success, and
+.Em \-1
+if something went wrong.
+.Sh SEE ALSO
+.Xr libpuzzle 3
+.Xr puzzle\-diff 8