/*====================================================================* - Copyright (C) 2001 Leptonica. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *====================================================================*/ /* * hash_reg.c * * Tests hashing functions for strings and points, and the use of them with: * * sets (underlying rbtree implementation for sorting) * * hash maps (underlying dnaHash implementation for accessing) * * For sets, it's important to use good 64-bit hashes to ensure that * collisions are very rare. With solid randomization, you expect * that a collision is likely with 2^32 or more hashed entities. * The probability of a collision goes as n^2, so with 10M entities, * the collision probabililty is about 10^-5. * * For the dna hashing, a faster but weaker hash function is used. * The hash should do a reasonable job of randomizing the lower order * bits corresponding to the prime number used with the mod function * for assigning to buckets. (To the extent that those bits are not * randomized, the calculation will run slower because bucket * occupancy will not be random, but the result will still be exact.) * Hash collisions in the key are allowed because the dna in * the selected bucket stores integers into arrays (of pts or strings, * for example), and not keys. The input point or string is hashed to * a bucket (a dna), which is then traversed, and each stored value * (an index) is used check if the point or string is in the associated * array at that location. * * Also tests similar functions directly (without hashing the number) for dna. * This will allow handling of both float64 and large integers that are * accurately represented by float64. */ #ifdef HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include "allheaders.h" static SARRAY *BuildShortStrings(l_int32 nchars, l_int32 add_dups); static PTA *BuildPointSet(l_int32 w, l_int32 h, l_int32 add_dups); l_int32 main(int argc, char **argv) { l_int32 ncolors, c1, c2, c3, s1; L_ASET *set; L_DNA *da1, *da2, *da3, *da4, *da5, *da6, *da7, *da8, *dav, *dac; L_DNAHASH *dahash; GPLOT *gplot; NUMA *nav, *nac; PTA *pta1, *pta2, *pta3; SARRAY *sa1, *sa2, *sa3, *sa4; PIX *pix1; L_REGPARAMS *rp; if (regTestSetup(argc, argv, &rp)) return 1; lept_mkdir("lept/hash"); /* Test string hashing with aset */ sa1 = BuildShortStrings(3, 0); sa2 = BuildShortStrings(3, 1); c1 = sarrayGetCount(sa1); c2 = sarrayGetCount(sa2); regTestCompareValues(rp, 18278, c1, 0); /* 0 */ regTestCompareValues(rp, 20982, c2, 0); /* 1 */ if (rp->display) { lept_stderr("Set results with string hashing:\n"); lept_stderr(" size with unique strings: %d\n", c1); lept_stderr(" size with dups: %d\n", c2); } startTimer(); set = l_asetCreateFromSarray(sa2); s1 = l_asetSize(set); regTestCompareValues(rp, 18278, s1, 0); /* 2 */ if (rp->display) { lept_stderr(" time to make set: %5.3f sec\n", stopTimer()); lept_stderr(" size of set without dups: %d\n", s1); } l_asetDestroy(&set); startTimer(); sa3 = sarrayRemoveDupsByAset(sa2); c1 = sarrayGetCount(sa3); regTestCompareValues(rp, 18278, c1, 0); /* 3 */ if (rp->display) { lept_stderr(" time to remove dups: %5.3f sec\n", stopTimer()); lept_stderr(" size without dups = %d\n", c1); } startTimer(); sa4 = sarrayIntersectionByAset(sa1, sa2); c1 = sarrayGetCount(sa4); regTestCompareValues(rp, 18278, c1, 0); /* 4 */ if (rp->display) { lept_stderr(" time to intersect: %5.3f sec\n", stopTimer()); lept_stderr(" intersection size = %d\n", c1); } sarrayDestroy(&sa3); sarrayDestroy(&sa4); /* Test sarray set operations with dna hash. * We use the same hash function as is used with aset. */ if (rp->display) { lept_stderr("\nDna hash results for sarray:\n"); lept_stderr(" size with unique strings: %d\n", sarrayGetCount(sa1)); lept_stderr(" size with dups: %d\n", sarrayGetCount(sa2)); } startTimer(); dahash = l_dnaHashCreateFromSarray(sa2); s1 = l_dnaHashGetTotalCount(dahash); regTestCompareValues(rp, 20982, s1, 0); /* 5 */ if (rp->display) { lept_stderr(" time to make hashmap: %5.3f sec\n", stopTimer()); lept_stderr(" entries in hashmap with dups: %d\n", s1); } l_dnaHashDestroy(&dahash); startTimer(); sarrayRemoveDupsByHash(sa2, &sa3, NULL); c1 = sarrayGetCount(sa3); regTestCompareValues(rp, 18278, c1, 0); /* 6 */ if (rp->display) { lept_stderr(" time to remove dups: %5.3f sec\n", stopTimer()); lept_stderr(" size without dups = %d\n", c1); } startTimer(); sa4 = sarrayIntersectionByHash(sa1, sa2); c1 = sarrayGetCount(sa4); regTestCompareValues(rp, 18278, c1, 0); /* 7 */ if (rp->display) { lept_stderr(" time to intersect: %5.3f sec\n", stopTimer()); lept_stderr(" intersection size = %d\n", c1); } sarrayDestroy(&sa3); sarrayDestroy(&sa4); sarrayDestroy(&sa1); sarrayDestroy(&sa2); /* Test point hashing with aset. * Enter all points within a 300 x 300 image in pta1, and include * 18,000 duplicates in pta2. With this pt hashing function, * there are no hash collisions among any of the 400 million pixel * locations in a 20000 x 20000 image. */ pta1 = BuildPointSet(300, 300, 0); pta2 = BuildPointSet(300, 300, 1); c1 = ptaGetCount(pta1); c2 = ptaGetCount(pta2); regTestCompareValues(rp, 90000, c1, 0); /* 8 */ regTestCompareValues(rp, 108000, c2, 0); /* 9 */ if (rp->display) { lept_stderr("\nSet results for pta:\n"); lept_stderr(" pta1 size with unique points: %d\n", c1); lept_stderr(" pta2 size with dups: %d\n", c2); } startTimer(); pta3 = ptaRemoveDupsByAset(pta2); c1 = ptaGetCount(pta3); regTestCompareValues(rp, 90000, c1, 0); /* 10 */ if (rp->display) { lept_stderr(" Time to remove dups: %5.3f sec\n", stopTimer()); lept_stderr(" size without dups = %d\n", ptaGetCount(pta3)); } ptaDestroy(&pta3); startTimer(); pta3 = ptaIntersectionByAset(pta1, pta2); c1 = ptaGetCount(pta3); regTestCompareValues(rp, 90000, c1, 0); /* 11 */ if (rp->display) { lept_stderr(" Time to intersect: %5.3f sec\n", stopTimer()); lept_stderr(" intersection size = %d\n", c1); } ptaDestroy(&pta1); ptaDestroy(&pta2); ptaDestroy(&pta3); /* Test pta set operations with dna hash, using the same pt hashing * function. Although there are no collisions in 20K x 20K images, * the dna hash implementation works properly even if there are some. */ pta1 = BuildPointSet(400, 400, 0); pta2 = BuildPointSet(400, 400, 1); c1 = ptaGetCount(pta1); c2 = ptaGetCount(pta2); regTestCompareValues(rp, 160000, c1, 0); /* 12 */ regTestCompareValues(rp, 192000, c2, 0); /* 13 */ if (rp->display) { lept_stderr("\nDna hash results for pta:\n"); lept_stderr(" pta1 size with unique points: %d\n", c1); lept_stderr(" pta2 size with dups: %d\n", c2); } startTimer(); ptaRemoveDupsByHash(pta2, &pta3, NULL); c1 = ptaGetCount(pta3); regTestCompareValues(rp, 160000, c1, 0); /* 14 */ if (rp->display) { lept_stderr(" Time to remove dups: %5.3f sec\n", stopTimer()); lept_stderr(" size without dups = %d\n", c1); } ptaDestroy(&pta3); startTimer(); pta3 = ptaIntersectionByHash(pta1, pta2); c1 = ptaGetCount(pta3); regTestCompareValues(rp, 160000, c1, 0); /* 15 */ if (rp->display) { lept_stderr(" Time to intersect: %5.3f sec\n", stopTimer()); lept_stderr(" intersection size = %d\n", c1); } ptaDestroy(&pta1); ptaDestroy(&pta2); ptaDestroy(&pta3); /* Test dna set and histo operations using dna hash */ da1 = l_dnaMakeSequence(0.0, 0.125, 8000); da2 = l_dnaMakeSequence(300.0, 0.125, 8000); da3 = l_dnaMakeSequence(600.0, 0.125, 8000); da4 = l_dnaMakeSequence(900.0, 0.125, 8000); da5 = l_dnaMakeSequence(1200.0, 0.125, 8000); l_dnaJoin(da1, da2, 0, -1); l_dnaJoin(da1, da3, 0, -1); l_dnaJoin(da1, da4, 0, -1); l_dnaJoin(da1, da5, 0, -1); l_dnaRemoveDupsByHash(da1, &da6, &dahash); l_dnaHashDestroy(&dahash); c1 = l_dnaGetCount(da1); c2 = l_dnaGetCount(da6); regTestCompareValues(rp, 40000, c1, 0); /* 16 */ regTestCompareValues(rp, 17600, c2, 0); /* 17 */ if (rp->display) { lept_stderr("\nDna hash results for dna:\n"); lept_stderr(" dna size with dups = %d\n", c1); lept_stderr(" dna size of unique numbers = %d\n", c2); } l_dnaMakeHistoByHash(da1, &dahash, &dav, &dac); nav = l_dnaConvertToNuma(dav); nac = l_dnaConvertToNuma(dac); c1 = l_dnaGetCount(dac); regTestCompareValues(rp, 17600, c1, 0); /* 18 */ if (rp->display) lept_stderr(" dna number of histo points = %d\n", c1); /* Plot it twice */ pix1 = gplotGeneralPix2(nav, nac, GPLOT_IMPULSES, "/tmp/lept/hash/histo", "Histo", NULL, NULL); gplot = gplotSimpleXY1(nav, nac, GPLOT_IMPULSES, GPLOT_PNG, "/tmp/lept/hash/histo", "Histo"); if (rp->display) { pixDisplay(pix1, 700, 100); l_fileDisplay("/tmp/lept/hash/histo.png", 700, 600, 1.0); } pixDestroy(&pix1); gplotDestroy(&gplot); da7 = l_dnaIntersectionByHash(da2, da3); c1 = l_dnaGetCount(da2); c2 = l_dnaGetCount(da3); c3 = l_dnaGetCount(da7); regTestCompareValues(rp, 8000, c1, 0); /* 19 */ regTestCompareValues(rp, 8000, c2, 0); /* 20 */ regTestCompareValues(rp, 5600, c3, 0); /* 21 */ if (rp->display) { lept_stderr(" dna num points: da2 = %d, da3 = %d\n", c1, c2); lept_stderr(" dna num da2/da3 intersection points = %d\n", c3); } l_dnaDestroy(&da1); l_dnaDestroy(&da2); l_dnaDestroy(&da3); l_dnaDestroy(&da4); l_dnaDestroy(&da5); l_dnaDestroy(&da6); l_dnaDestroy(&da7); l_dnaDestroy(&dac); l_dnaDestroy(&dav); l_dnaHashDestroy(&dahash); numaDestroy(&nav); numaDestroy(&nac); /* Test pixel counting operations with hashmap and ordered map */ pix1 = pixRead("wet-day.jpg"); pixCountRGBColorsByHash(pix1, &c1); pixCountRGBColors(pix1, 1, &c2); regTestCompareValues(rp, 42427, c1, 0); /* 22 */ regTestCompareValues(rp, 42427, c2, 0); /* 23 */ if (rp->display) { lept_stderr("\nColor count using dna hash: %d\n", c1); lept_stderr("Color count using amap: %d\n", c2); } pixDestroy(&pix1); /* Test aset operations on dna. */ da1 = l_dnaMakeSequence(0, 3, 10000); da2 = l_dnaMakeSequence(0, 5, 10000); da3 = l_dnaMakeSequence(0, 7, 10000); l_dnaJoin(da1, da2, 0, -1); l_dnaJoin(da1, da3, 0, -1); set = l_asetCreateFromDna(da1); c1 = l_dnaGetCount(da1); c2 = l_asetSize(set); regTestCompareValues(rp, 30000, c1, 0); /* 24 */ regTestCompareValues(rp, 25428, c2, 0); /* 25 */ if (rp->display) { lept_stderr("\nDna results using set:\n"); lept_stderr(" da1 count: %d\n", c1); lept_stderr(" da1 set size: %d\n\n", c2); } l_asetDestroy(&set); da4 = l_dnaUnionByAset(da2, da3); set = l_asetCreateFromDna(da4); c1 = l_dnaGetCount(da4); c2 = l_asetSize(set); regTestCompareValues(rp, 18571, c1, 0); /* 26 */ regTestCompareValues(rp, 18571, c2, 0); /* 27 */ if (rp->display) { lept_stderr(" da4 count: %d\n", c1); lept_stderr(" da4 set size: %d\n\n", c2); } l_asetDestroy(&set); da5 = l_dnaIntersectionByAset(da1, da2); set = l_asetCreateFromDna(da5); c1 = l_dnaGetCount(da5); c2 = l_asetSize(set); regTestCompareValues(rp, 10000, c1, 0); /* 28 */ regTestCompareValues(rp, 10000, c2, 0); /* 29 */ if (rp->display) { lept_stderr(" da5 count: %d\n", c1); lept_stderr(" da5 set size: %d\n\n", c2); } l_asetDestroy(&set); da6 = l_dnaMakeSequence(100000, 11, 5000); l_dnaJoin(da6, da1, 0, -1); set = l_asetCreateFromDna(da6); c1 = l_dnaGetCount(da6); c2 = l_asetSize(set); regTestCompareValues(rp, 35000, c1, 0); /* 30 */ regTestCompareValues(rp, 30428, c2, 0); /* 31 */ if (rp->display) { lept_stderr(" da6 count: %d\n", c1); lept_stderr(" da6 set size: %d\n\n", c2); } l_asetDestroy(&set); da7 = l_dnaIntersectionByAset(da6, da3); set = l_asetCreateFromDna(da7); c1 = l_dnaGetCount(da7); c2 = l_asetSize(set); regTestCompareValues(rp, 10000, c1, 0); /* 32 */ regTestCompareValues(rp, 10000, c2, 0); /* 33 */ if (rp->display) { lept_stderr(" da7 count: %d\n", c1); lept_stderr(" da7 set size: %d\n\n", c2); } l_asetDestroy(&set); da8 = l_dnaRemoveDupsByAset(da1); c1 = l_dnaGetCount(da8); regTestCompareValues(rp, 25428, c1, 0); /* 34 */ if (rp->display) lept_stderr(" da8 count: %d\n\n", c1); l_dnaDestroy(&da1); l_dnaDestroy(&da2); l_dnaDestroy(&da3); l_dnaDestroy(&da4); l_dnaDestroy(&da5); l_dnaDestroy(&da6); l_dnaDestroy(&da7); l_dnaDestroy(&da8); return regTestCleanup(rp); } /* Build all possible strings, up to a max of 5 roman alphabet characters */ static SARRAY * BuildShortStrings(l_int32 nchars, /* 3, 4 or 5 */ l_int32 add_dups) { char buf[64]; l_int32 i, j, k, l, m; l_uint64 hash; SARRAY *sa; sa = sarrayCreate(1000); for (i = 0; i < 26; i++) { snprintf(buf, sizeof(buf), "%c", i + 0x61); sarrayAddString(sa, buf, L_COPY); for (j = 0; j < 26; j++) { snprintf(buf, sizeof(buf), "%c%c", i + 0x61, j + 0x61); sarrayAddString(sa, buf, L_COPY); for (k = 0; k < 26; k++) { snprintf(buf, sizeof(buf), "%c%c%c", i + 0x61, j + 0x61, k + 0x61); sarrayAddString(sa, buf, L_COPY); if (add_dups && k < 4) /* add redundant strings */ sarrayAddString(sa, buf, L_COPY); if (nchars > 3) { for (l = 0; l < 26; l++) { snprintf(buf, sizeof(buf), "%c%c%c%c", i + 0x61, j + 0x61, k + 0x61, l + 0x61); sarrayAddString(sa, buf, L_COPY); if (add_dups && l < 4) /* add redundant strings */ sarrayAddString(sa, buf, L_COPY); if (nchars > 4) { for (m = 0; m < 26; m++) { snprintf(buf, sizeof(buf), "%c%c%c%c%c", i + 0x61, j + 0x61, k + 0x61, l + 0x61, m + 0x61); sarrayAddString(sa, buf, L_COPY); if (!add_dups && i == 17 && j == 12 && k == 4 && l == 21) { l_hashStringToUint64(buf, &hash); lept_stderr(" %llx\n", hash); } if (add_dups && m < 4) /* add redundant */ sarrayAddString(sa, buf, L_COPY); } } } } } } } return sa; } static PTA * BuildPointSet(l_int32 w, l_int32 h, l_int32 add_dups) { l_int32 i, j; PTA *pta; pta = ptaCreate(w * h); for (i = 0; i < h; i++) { for (j = 0; j < w; j++) ptaAddPt(pta, j, i); if (add_dups) { /* extra (0.2 * w * h) points */ for (j = 0.4 * w; j < 0.6 * w; j++) ptaAddPt(pta, j, i); } } return pta; }