aboutsummaryrefslogtreecommitdiff
path: root/xsig/include/signature_db.hpp
blob: fc98e6a767da4d9182f1149ffc79d3452ebd9ca0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
//Chris Xiong 2022
//License: MPL-2.0
#ifndef SIGNATURE_DB_HPP
#define SIGNATURE_DB_HPP

#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <functional>
#include <utility>
#include <vector>

#include "signature.hpp"

//shorthand for std::filesystem, used by the declarations in this header
//declared at global scope, so it is also visible to every file that
//includes this header
namespace fs = std::filesystem;

//element type of the list returned by signature_db::find_subslice()
//the two fields carry the same names as the (id, slice) pair accepted by
//signature_db::put_subslice(); presumably they identify the image and the
//slice within it -- confirm against the implementation
struct subslice_t
{
    size_t id;
    size_t slice;
};

//element type of the list returned by signature_db::dupe_pairs()
//mirrors the (ida, idb, dist) arguments of signature_db::put_dupe_pair():
//two ids and the distance recorded for that pair
struct dupe_t
{
    size_t id1;
    size_t id2;
    double distance;
};

//configuration passed to signature_db::populate() and
//signature_db::search_image()
//scalar members carry zero defaults so that a default-constructed object
//never holds indeterminate values; callers are still expected to fill in
//every field they rely on. The struct remains an aggregate, so existing
//brace initialization keeps working.
struct populate_cfg_t
{
    //slice counts; presumably horizontal / vertical -- confirm against the
    //implementation
    size_t nsliceh = 0;
    size_t nslicev = 0;
    //signature settings; by their names, one for the full image and one for
    //subslices (see signature.hpp for signature_config)
    signature_config scfg_full;
    signature_config scfg_subslice;
    //NOTE(review): looks like the distance cutoff for reporting a pair as
    //duplicates -- confirm
    double threshold = 0.0;
    //empty by default. NOTE(review): the meaning of the (size_t, int)
    //arguments is not visible from this header -- confirm before use
    std::function<void(size_t, int)> callback;
    //NOTE(review): presumably the number of parallel jobs -- confirm
    int njobs = 0;
};

//private implementation type behind signature_db
//only forward-declared here; its definition is not part of this header
struct signature_db_priv;

//database of image signatures, subslices, duplicate pairs and similarity
//groups
//all state lives behind a pointer to the opaque signature_db_priv, so this
//header exposes no storage details
class signature_db
{
private:
    //raw pointer to the private implementation (presumably created by the
    //constructor and released by the destructor -- see the implementation)
    signature_db_priv *p;
public:
    //open a signature database
    //if dbpath is an empty path (default), the database will reside in RAM
    //and will be automatically initialized
    //otherwise it opens the database specified by dbpath
    //if the database specified by dbpath doesn't exist, it will be created
    //and initialized
    //if the database file exists but is not a valid signature database, it
    //will be immediately closed and any subsequent calls to this signature db
    //object will do nothing. The object will be marked invalid.
    signature_db(const fs::path &dbpath = fs::path());
    ~signature_db();

    //copying is disabled: the compiler-generated copy would only duplicate
    //the raw pointer above, leaving two objects sharing (and both tearing
    //down) the same private implementation
    signature_db(const signature_db&) = delete;
    signature_db& operator=(const signature_db&) = delete;

    //true if db is valid, false if not.
    //only useful when opening an existing db file
    bool valid();
    //true if db has changes that are not written to disk, false otherwise
    //always true if db is not in RAM.
    //NOTE(review): "always true" for an on-disk db reads backwards (one
    //would expect it to never be dirty) -- confirm against the implementation
    bool is_dirty();

    //insert image signature into database
    //if id is omitted (left at the default ~size_t(0)), it's assigned
    //automatically and returned
    //if specified, id must be unique
    //treat automatically assigned id as arbitrary opaque integers
    size_t put_signature(const fs::path &path, const signature &sig, size_t id = ~size_t(0));
    //bracket a group of get_signature() calls with
    //batch_get_signature_begin() / batch_get_signature_end()
    //(presumably a performance aid like the subslice batch calls below)
    void batch_get_signature_begin();
    //get image signature from database
    //returns the image path together with its signature
    std::pair<fs::path, signature> get_signature(size_t id);
    void batch_get_signature_end();

    //ids of the images in the database
    std::vector<size_t> get_image_ids();

    //place batch_put_subslice_begin() and batch_put_subslice_end() around a group of
    //put_subslice() calls to improve performance
    void batch_put_subslice_begin();
    //insert subslice into database
    //(id, slice) must be unique
    //calling batch_put_subslice_begin() before this is NOT required, but
    //will improve performance
    void put_subslice(size_t id, size_t slice, const signature &slicesig);
    void batch_put_subslice_end();

    //same idea as batch_put_subslice_begin() / batch_put_subslice_end(),
    //but for a group of find_subslice() calls
    void batch_find_subslice_begin();
    //find identical subslices from database
    std::vector<subslice_t> find_subslice(const signature &slicesig);
    void batch_find_subslice_end();

    //record a pair of ids together with their distance
    void put_dupe_pair(size_t ida, size_t idb, double dist);
    //list of recorded pairs, as dupe_t entries
    std::vector<dupe_t> dupe_pairs();

    //NOTE(review): what lock() / unlock() guard is not visible from this
    //header -- see the implementation before relying on them
    void lock();
    void unlock();

    //NOTE(review): by their names these save the db to / load it from the
    //file at path, with the bool reporting success -- confirm
    bool to_db_file(const fs::path &path);
    bool from_db_file(const fs::path &path);

    //process the given image paths using the settings in cfg
    void populate(const std::vector<fs::path> &paths, const populate_cfg_t &cfg);
    //NOTE(review): presumably asks a running populate() to stop early --
    //confirm
    void populate_interrupt();

    //search the database for the image at path using the settings in cfg
    //returns (size_t, double) pairs; presumably (image id, distance) --
    //confirm against the implementation
    //NOTE(review): insert presumably also adds the queried image to the
    //database -- confirm
    std::vector<std::pair<size_t, double>> search_image(const fs::path &path, const populate_cfg_t &cfg, bool insert = false);

    //memory usage figure for the database (unit not stated in this header)
    int64_t db_memory_usage();

    //disjoint set for keeping similar images in the same group
    //some of these probably shouldn't be public. TBD...
    void ds_init();

    void batch_ds_get_parent_begin();
    size_t ds_get_parent(size_t id);
    void batch_ds_get_parent_end();

    void batch_ds_set_parent_begin();
    size_t ds_set_parent(size_t id, size_t par);
    void batch_ds_set_parent_end();

    size_t ds_find(size_t id);
    void ds_merge(size_t id1, size_t id2);

    //group similar images together using results from dupe_pairs()
    //usually very fast, unless you have a crack ton of duplicates...
    void group_similar();
    //get all groups, each contained in their own lists.
    //now no longer returns group with size of 1.
    std::vector<std::vector<size_t>> groups_get();
};

#endif