Files
monero-gui/src/zxcvbn-c/dict-generate.cpp
2016-12-17 11:32:33 +00:00

1763 lines
54 KiB
C++

/**********************************************************************************
* Program to generate the dictionary for the C implementation of the zxcvbn password estimator.
* Copyright (c) 2015, Tony Evans
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
* permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this list
* of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or other
* materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors may be
* used to endorse or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
* SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
**********************************************************************************/
#include <iostream>
#include <string>
#include <fstream>
#include <list>
#include <set>
#include <vector>
#include <map>
#include <memory>
#include <limits>
#include <ctype.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
using namespace std;
// Forward declaration so the pointer and map aliases below can name the node type.
class Node;
using NodeSPtr  = std::shared_ptr<Node>;     // Shared-ownership handle to a trie node
using NodeWPtr  = std::weak_ptr<Node>;       // Non-owning handle to a trie node
using NodeMap_t = std::map<char, NodeSPtr>;  // Children of a node, keyed by character
// File-scope checksum type. TrieCheck declares its own nested Check_t as well.
using Check_t   = unsigned int;
/**********************************************************************************
 * Running 64 bit CRC checksum. Bytes are folded in with operator(), the current
 * value is read with Result() or through the implicit conversion to Check_t.
 */
class TrieCheck
{
public:
    using Check_t = uint64_t;
    // Value held by a checksum that has had no data added since Init()
    static const Check_t CHK_INIT = 0xffffffffffffffff;

    TrieCheck() : mCrc(CHK_INIT) { }

    // Reset the checksum to its initial value
    void Init() { mCrc = CHK_INIT; }
    // Current checksum value
    Check_t Result() const { return mCrc; }
    operator Check_t() const { return mCrc; }
    // True while the checksum still equals the initial value
    bool operator ! () const { return CHK_INIT == mCrc; }
    // Fold a block of bytes (start address, byte count) into the checksum
    void operator () (const void *, unsigned int);
protected:
    Check_t mCrc;
};
/**********************************************************************************
 * Class to hold a node within the trie.
 *
 * A node owns its children through shared pointers keyed by character and keeps a
 * raw (non-owning) pointer to one parent. Endings, height and checksum are cached
 * once calculated; a negative endings/height value means "not calculated yet".
 * Every scalar member has a default initialiser so that no constructor can leave
 * mAddr or mCounted with an indeterminate value.
 */
class Node
{
public:
    Node();
    Node(const Node &);
    ~Node();
    Node & operator = (const Node &);

    // Mark / query whether a word ends at this node
    void SetEnd() { mEnd = true; }
    bool IsEnd() const { return mEnd; }
    // Cached height; only meaningful after CalcHeight() has been called
    int Height() const { return mHeight; }
    // Scan the trie and count nodes, each node being counted once
    int NodeCount() { ClearCounted() ; return CountNodes(); }
    // Assign addresses in two passes: first with ManyEnds set, then the remaining nodes
    int CalcAddress() { int a=0; ClearCounted(); a=CalcAddr(a, true); return CalcAddr(a, false); }
    Node *GetParent() { return mParent; }
    unsigned int GetAddr() const { return mAddr; }
    NodeMap_t::iterator ChildBegin() { return mChild.begin(); }
    NodeMap_t::iterator ChildEnd() { return mChild.end(); }
    // Cached number of word endings; only meaningful after CalcEndings()
    int GetNumEnds() const { return mEndings; }
    NodeSPtr FindChild(char);
    std::string GetChildChars();
    TrieCheck::Check_t CalcCheck();
    int CalcEndings();
    int CalcHeight();
    NodeSPtr AddChild(char);
    void ChangeChild(NodeSPtr &, NodeSPtr &);
    // Traversal marker used by the counting/addressing scans
    void ClearCounted();
    void SetCounted() { mCounted = true; }
    bool IsCounted() const { return mCounted; }
protected:
    int CountNodes();
    int CalcAddr(int, bool);
    NodeMap_t mChild;
    Node *mParent = nullptr;
    int mEndings = -1;
    int mHeight = -1;
    unsigned int mAddr = 0;
    TrieCheck mCheck;
    bool mEnd = false;
    bool mCounted = false;
};
/**********************************************************************************
 * Lookup table for the CRC implementation. It has 16 entries because
 * TrieCheck::operator() indexes it with 4 bits of data at a time.
 */
static const TrieCheck::Check_t CrcTable[16] =
{
    0x0000000000000000ULL, 0x7d08ff3b88be6f81ULL,
    0xfa11fe77117cdf02ULL, 0x8719014c99c2b083ULL,
    0xdf7adabd7a6e2d6fULL, 0xa2722586f2d042eeULL,
    0x256b24ca6b12f26dULL, 0x5863dbf1e3ac9decULL,
    0x95ac9329ac4bc9b5ULL, 0xe8a46c1224f5a634ULL,
    0x6fbd6d5ebd3716b7ULL, 0x12b5926535897936ULL,
    0x4ad64994d625e4daULL, 0x37deb6af5e9b8b5bULL,
    0xb0c7b7e3c7593bd8ULL, 0xcdcf48d84fe75459ULL
};
// Fold Len bytes starting at v into the checksum. Each byte is processed as two
// 4 bit halves (low half first, then high half), each step using one CrcTable entry.
void TrieCheck::operator () (const void *v, unsigned int Len)
{
    const unsigned char *Bytes = static_cast<const unsigned char *>(v);
    Check_t Sum = mCrc;
    for(unsigned int Pos = 0; Pos < Len; ++Pos)
    {
        const unsigned int Byte = Bytes[Pos];
        Sum = CrcTable[(Sum ^ Byte) & 0x0f] ^ (Sum >> 4);
        Sum = CrcTable[(Sum ^ (Byte >> 4)) & 0x0f] ^ (Sum >> 4);
    }
    mCrc = Sum;
}
// Construct an empty node: no parent, no children, not a word ending. Endings and
// height are set to -1, which CalcEndings()/CalcHeight() treat as "not calculated".
// mAddr and mCounted are given defined values too, so that GetAddr()/IsCounted()
// never read an indeterminate member.
Node::Node() :
    mParent(nullptr),
    mEndings(-1),
    mHeight(-1),
    mAddr(0),
    mEnd(false),
    mCounted(false)
{
}
// Copy construct. The members are first given the same defined defaults as a new
// node, then the assignment operator copies the state from r. Initialising first
// means members the assignment does not copy still hold a defined value.
Node::Node(const Node &r) :
    mParent(nullptr),
    mEndings(-1),
    mHeight(-1),
    mAddr(0),
    mEnd(false),
    mCounted(false)
{
    *this = r;
}
// Nothing to release by hand: the children are held through shared pointers.
Node::~Node() = default;
// Copy the child map, parent pointer, cached endings/height/checksum and the word
// ending flag from r. mAddr and mCounted are not copied.
Node &Node::operator = (const Node & r)
{
    mChild   = r.mChild;
    mEnd     = r.mEnd;
    mCheck   = r.mCheck;
    mHeight  = r.mHeight;
    mEndings = r.mEndings;
    mParent  = r.mParent;
    return *this;
}
/**********************************************************************************
 * Generate a checksum for the current node. The value depends on the number of
 * children, each child's character and checksum, and whether this node ends a
 * word. The result is cached: it is only calculated while the stored checksum
 * still holds its initial value.
 */
TrieCheck::Check_t Node::CalcCheck()
{
    if (!mCheck)
    {
        // Not done this node before
        mCheck.Init();
        // Include number of children
        char c = static_cast<char>(mChild.size());
        mCheck(&c, sizeof c);
        // For each child include its character and node checksum
        for(NodeMap_t::iterator It = mChild.begin(); It != mChild.end(); ++It)
        {
            // Must be TrieCheck::Check_t: a plain Check_t here names the file-scope
            // unsigned int type and would drop the top half of the child checksum.
            const TrieCheck::Check_t n = It->second->CalcCheck();
            c = It->first;
            mCheck(&c, sizeof c);
            mCheck(&n, sizeof n);
        }
        // Finally include whether this node is an ending in the checksum
        c = mEnd;
        mCheck(&c, sizeof c);
    }
    return mCheck;
}
/**********************************************************************************
 * Number of word endings at or below this node: the sum over all children plus
 * one if this node itself ends a word. The value is cached after the first call.
 */
int Node::CalcEndings()
{
    if (mEndings >= 0)
        return mEndings;    // Already calculated

    int Total = mEnd ? 1 : 0;
    for(auto & Kv : mChild)
        Total += Kv.second->CalcEndings();
    mEndings = Total;
    return mEndings;
}
/**********************************************************************************
 * Height of the trie below this node: 0 for a node without children, otherwise
 * one more than the tallest child. The value is cached after the first call.
 */
int Node::CalcHeight()
{
    if (mHeight < 0)
    {
        int Tallest = 0;
        for(auto & Kv : mChild)
        {
            const int ViaChild = Kv.second->CalcHeight() + 1;
            if (ViaChild > Tallest)
                Tallest = ViaChild;
        }
        mHeight = Tallest;
    }
    return mHeight;
}
/**********************************************************************************
 * Reset the 'counted' marker on this node and, recursively, on every node below.
 */
void Node::ClearCounted()
{
    mCounted = false;
    for(auto & Kv : mChild)
        Kv.second->ClearCounted();
}
/**********************************************************************************
 * Count this node plus all nodes below it. A node that is already marked as
 * counted contributes 0, so a node reachable by several paths is counted once.
 */
int Node::CountNodes()
{
    if (mCounted)
        return 0;
    mCounted = true;

    int Total = 1;      // This node
    for(auto & Kv : mChild)
        Total += Kv.second->CountNodes();
    return Total;
}
/**********************************************************************************
 * Assign final node addresses. Start is the next free address and the next free
 * address after this sub-tree is returned. A node gets an address only if it has
 * none yet (not marked counted); when ManyEnds is set only nodes with at least
 * 256 endings qualify. The children are always visited.
 */
int Node::CalcAddr(int Start, bool ManyEnds)
{
    const bool Qualifies = !ManyEnds || (mEndings >= 256);
    if (!mCounted && Qualifies)
    {
        mCounted = true;
        mAddr = Start++;
    }
    for(auto & Kv : mChild)
        Start = Kv.second->CalcAddr(Start, ManyEnds);
    return Start;
}
/**********************************************************************************
 * Return the child node for the given character, creating it (with this node as
 * its parent) if it does not exist yet.
 */
NodeSPtr Node::AddChild(char c)
{
    const NodeMap_t::iterator Found = mChild.find(c);
    if (Found != mChild.end())
        return Found->second;

    // New character, create new child node
    NodeSPtr Fresh(new Node);
    Fresh->mParent = this;
    return mChild.insert(NodeMap_t::value_type(c, Fresh)).first->second;
}
/**********************************************************************************
 * Look up the child node for the given character. Returns an empty pointer when
 * there is no such child.
 */
NodeSPtr Node::FindChild(char Ch)
{
    const NodeMap_t::iterator Pos = mChild.find(Ch);
    return (Pos != mChild.end()) ? Pos->second : NodeSPtr();
}
/**********************************************************************************
 * Replace the first child equal to Old with Replace and make this node the parent
 * of Replace. Does nothing if Old is not a child of this node.
 */
void Node::ChangeChild(NodeSPtr & Replace, NodeSPtr & Old)
{
    NodeMap_t::iterator Slot = mChild.begin();
    while((Slot != mChild.end()) && !(Slot->second == Old))
        ++Slot;
    if (Slot == mChild.end())
        return;
    Slot->second = Replace;
    Replace->mParent = this;
}
/**********************************************************************************
 * Build a string of the characters of all children of this node, in the order
 * the child map holds them.
 */
std::string Node::GetChildChars()
{
    std::string Chars;
    for(auto & Kv : mChild)
        Chars.push_back(Kv.first);
    return Chars;
}
/**********************************************************************************
 * Per-word data kept alongside the word string. All fields start at zero.
 */
struct Entry
{
    Entry() { }
    int mRank = 0;      // Rank given to the word when the word lists are combined
    int mDict = 0;      // Index of the input file the word was taken from
    int mOrder = 0;     // Index of the word as found by the trie scan
    int mOccurs = 0;
};
/**********************************************************************************
 * A string paired with an unsigned value. The comparison operators look at the
 * string only, so a std::set of these is keyed by the string.
 */
struct StringInt
{
    std::string s;
    unsigned int i;

    StringInt() : s(), i(0) { }
    StringInt(const StringInt & r) : s(r.s), i(r.i) { }
    StringInt & operator = (const StringInt & r)
    {
        i = r.i;
        s = r.s;
        return *this;
    }

    bool operator < (const StringInt & r) const { return s.compare(r.s) < 0; }
    bool operator > (const StringInt & r) const { return s.compare(r.s) > 0; }
    bool operator == (const StringInt & r) const { return s.compare(r.s) == 0; }

    // Writable pointer to this object even when it is reached through a const reference
    StringInt * Self() const { return const_cast<StringInt *>(this); }
};
// Container aliases used throughout the generator
using EntryMap_t      = map<string, Entry>;     // Word -> data for that word
using StringList_t    = list<string>;
using NodeList_t      = list<NodeSPtr>;
using StringIntSet_t  = set<StringInt>;         // Ordered by the string part only
using StringOfInts    = basic_string<int>;      // A 'string' whose elements are ints
using UintVect        = vector<unsigned int>;
using Uint64Vect      = vector<uint64_t>;
using StrIntPtrVect_t = vector<StringInt *>;
using StringIntVect_t = vector<StringInt>;

// Variables holding 'interesting' information on the data
unsigned int MaxLength;
unsigned int MinLength;
unsigned int NumChars;
unsigned int NumInWords;
unsigned int NumDuplicate;
// Information gathered for one input word file. All counters start at zero.
struct FileInfo
{
    FileInfo() { }
    string Name;            // File name as passed to ReadInputFile()
    StringList_t Pwds;      // Accepted words, in file order
    int Words = 0;          // Non-empty lines seen
    int BruteIgnore = 0;    // Words dropped because brute force strength was below their rank
    int Accented = 0;       // Words dropped because they contain a non-ascii character
    int Dups = 0;           // Words already supplied by an earlier pick when combining
    int Used = 0;           // Words added to the combined list
    int Rank = 0;
};
/**********************************************************************************
 * Read the file of words and add them to the file information.
 *
 * FileName  File to read, one word per line. Anything after the first space, tab
 *           or carriage return on a line is discarded.
 * Info      Receives the file name, the accepted words (lower-cased) and counters.
 * MaxRank   Reading stops once this many words have been accepted.
 *
 * Returns false if the file could not be opened, otherwise true. The global
 * MaxLength, MinLength and NumChars statistics are updated for accepted words.
 */
static bool ReadInputFile(const string & FileName, FileInfo &Info, int MaxRank)
{
    ifstream f(FileName.c_str());
    if (!f.is_open())
    {
        cerr << "Error opening " << FileName << endl;
        return false;
    }
    Info.Name = FileName;

    // Rank is the position of the word in the dictionary file. Rank==1 is lowest for a word (and
    // indicates a very popular or bad password).
    int Rank = 0;
    string Line;
    while(getline(f, Line) && (Rank < MaxRank))
    {
        // Truncate at first space or tab to leave just the word in case additional info on line.
        // A carriage return is treated the same way so that a word list with CR/LF line
        // endings does not leave a trailing '\r' in every word.
        string::size_type y = Line.find_first_of("\t \r");
        if (y != string::npos)
            Line.erase(y);
        y = Line.length();
        if (!y)
            continue;
        ++Info.Words;

        // Only use words where all chars are ascii (no accents etc.)
        string::size_type x;
        double BruteForce = 1.0;
        for(x = 0; x < y; ++x)
        {
            unsigned char c = Line[x];
            if (c >= 128)
                break;
            Line[x] = static_cast<char>(tolower(c));
            BruteForce *= 26.0;
        }
        if (x < y)
        {
            ++Info.Accented;
            continue;
        }

        // Don't use words where the brute force strength is less than the word's rank
        if (BruteForce < (Rank+1))
        {
            ++Info.BruteIgnore;
            continue;
        }

        // Remember some interesting info
        if (y > MaxLength)
            MaxLength = static_cast<unsigned int>(y);
        if (y < MinLength)
            MinLength = static_cast<unsigned int>(y);
        NumChars += static_cast<unsigned int>(y);

        Info.Pwds.push_back(Line);
        ++Rank;
    }
    f.close();
    return true;
}
/**********************************************************************************
 * Merge the word lists of NumInfo files into Entries. The files are visited round
 * robin: on each round every file contributes its next word that is not already
 * in Entries, and all words taken in the same round get the same rank (1 for the
 * first round). Words already present are counted as duplicates and discarded.
 * The per-file word lists are emptied in the process.
 */
static void CombineWordLists(EntryMap_t & Entries, FileInfo *Infos, int NumInfo)
{
    int Rank = 0;
    bool Done;
    do
    {
        ++Rank;
        Done = true;
        for(int i = 0; i < NumInfo; ++i)
        {
            FileInfo & Info = Infos[i];
            while(!Info.Pwds.empty())
            {
                // At least one file still had words at the start of this pick
                Done = false;
                const string Word = Info.Pwds.front();
                Info.Pwds.pop_front();
                if (Entries.find(Word) != Entries.end())
                {
                    // Word is repeat of one already taken, try this file's next word
                    Info.Dups += 1;
                    ++NumDuplicate;
                    continue;
                }
                // New word, add it and move on to the next file
                Entry e;
                e.mDict = i;
                e.mRank = Rank;
                Entries.insert(std::pair<std::string, Entry>(Word, e));
                Info.Used += 1;
                break;
            }
        }
    } while(!Done);
}
/**********************************************************************************
 * Add every word in Entries to the trie which starts at Root, marking the node
 * reached by the last character of each word as a word ending. InputCharSet is a
 * bool array indexed by character code; the entry of every character used in a
 * word is set to true.
 */
static void ProcessEntries(NodeSPtr Root, EntryMap_t & Entries, bool *InputCharSet)
{
    for(EntryMap_t::iterator Word = Entries.begin(); Word != Entries.end(); ++Word)
    {
        const std::string & Text = Word->first;
        NodeSPtr Cur = Root;
        for(std::string::const_iterator Ch = Text.begin(); Ch != Text.end(); ++Ch)
        {
            // Step down the trie, creating the node if needed
            Cur = Cur->AddChild(*Ch);
            // Add char to set of used character codes
            InputCharSet[*Ch & 0xFF] = true;
        }
        Cur->SetEnd();
    }
}
/**********************************************************************************
 * Collect into Lst the nodes at or below Node whose stored height equals Hi. A
 * node with the wanted height is appended and not searched further; a node with a
 * greater height has each of its children searched; a node with a lower height is
 * ignored. Uses Height(), so the heights must have been calculated already.
 */
static void AddToListAtHeight(NodeList_t & Lst, NodeSPtr Node, int Hi)
{
    const int NodeHi = Node->Height();
    if (NodeHi == Hi)
    {
        Lst.push_back(Node);
        return;
    }
    if (NodeHi < Hi)
        return;
    for(NodeMap_t::iterator Child = Node->ChildBegin(); Child != Node->ChildEnd(); ++Child)
        AddToListAtHeight(Lst, Child->second, Hi);
}
/**********************************************************************************
 * Walk the trie depth first and number the words in the order they are met. Str
 * is the string spelled out on the way down to Root; Ord is the running counter
 * and is incremented before being stored in the matching entry's mOrder.
 * Throws a C string if a word ending in the trie has no entry in Entries.
 */
static void ScanTrieForOrder(EntryMap_t & Entries, int & Ord, NodeSPtr Root, const string & Str)
{
    if (Root->IsEnd())
    {
        // Root is a word ending node, so store its index in the input word store
        const EntryMap_t::iterator Found = Entries.find(Str);
        if (Found == Entries.end())
            throw "Trie string not in entries";
        Found->second.mOrder = ++Ord;
    }
    // Extend the current string with each child's character and recurse
    for(NodeMap_t::iterator Child = Root->ChildBegin(); Child != Root->ChildEnd(); ++Child)
        ScanTrieForOrder(Entries, Ord, Child->second, Str + Child->first);
}
/**********************************************************************************
 * Reduce the trie by merging tails where possible. Working from the greatest
 * height down to zero, all nodes of the current height are listed and compared
 * pairwise by checksum. When two checksums match, the parent of the later node
 * is changed to refer to the earlier node instead and the later node is removed
 * from the list. A matching node without a parent is reported as an orphan and
 * left in place.
 */
static void ReduceTrie(NodeSPtr Root)
{
    // Make sure every node has its checksum calculated before comparing
    Root->CalcCheck();
    for(int Height = Root->CalcHeight(); Height >= 0; --Height)
    {
        // Get a list of all nodes at given height
        NodeList_t Lst;
        AddToListAtHeight(Lst, Root, Height);

        for(NodeList_t::iterator Keep = Lst.begin(); Keep != Lst.end(); ++Keep)
        {
            // Going to use a CRC to decide if two nodes are identical
            const TrieCheck::Check_t KeepChk = (*Keep)->CalcCheck();
            NodeList_t::iterator Other = Keep;
            ++Other;
            while(Other != Lst.end())
            {
                if (!(KeepChk == (*Other)->CalcCheck()))
                {
                    ++Other;
                    continue;
                }
                // Found two identical nodes (with identical children)
                Node * OtherParent = (*Other)->GetParent();
                if (!OtherParent)
                {
                    cout << " orphan ";
                    ++Other;
                    continue;
                }
                // Point the second node's parent at the first node, then drop the
                // second node from the scan list (the shared pointers release it
                // once nothing refers to it any more)
                OtherParent->ChangeChild(*Keep, *Other);
                Other = Lst.erase(Other);
            }
        }
    }
}
/**********************************************************************************
 * Scan the trie to match with the supplied word. Return the order of the word
 * (counting from 1), or -1 if it is not in the trie. The order is built up on
 * the way down: the endings of every child skipped before the matching child are
 * added, plus one for every node passed through that is itself a word ending.
 */
static int CheckWord(NodeSPtr Root, const string & Str)
{
    int Order = 1;
    NodeSPtr Cur = Root;
    for(string::size_type Pos = 0; Pos < Str.length(); ++Pos)
    {
        const char Want = Str[Pos];
        // Skip over the children before the wanted character, adding their endings
        NodeMap_t::iterator Child = Cur->ChildBegin();
        for(; (Child != Cur->ChildEnd()) && !(Child->first == Want); ++Child)
            Order += Child->second->CalcEndings();
        // Fail if no child matches the character
        if (Child == Cur->ChildEnd())
            return -1;
        // Allow for this node being a word ending
        if (Cur->IsEnd())
            ++Order;
        Cur = Child->second;
    }
    // All characters matched; it is only a word if the final node ends one
    return (Cur && Cur->IsEnd()) ? Order : -1;
}
/**********************************************************************************
 * Try to find every input word in the reduced trie. The order found must match
 * the order stored in the entry, otherwise the reduction has corrupted the trie.
 * For every word found, Ranks[order] is filled with the word's rank and a label
 * made of the running count of good words, ": " and the word. Each word is also
 * checked in a modified form ('a' in front, '#' behind) which must not be found.
 * Returns the number of words found with the right order; throws (C string or
 * std::string) on any failure. Scanning stops early after more than 200000 errors.
 */
static int CheckReduction(StringIntVect_t & Ranks, NodeSPtr Root, EntryMap_t & Entries)
{
    int Bad = 0;
    int Good = 0;
    Ranks.resize(Entries.size()+1);
    for(EntryMap_t::iterator It = Entries.begin(); (It != Entries.end()) && (Bad <= 200000); ++It)
    {
        std::string Text = It->first;
        const Entry & Info = It->second;
        const int Found = CheckWord(Root, Text);
        if (Found < 0)
        {
            ++Bad;
            cout << Info.mOrder << ": Missing " << Text.c_str() << endl;
        }
        else if (Info.mOrder != Found)
        {
            ++Bad;
            cout << Info.mOrder << ": Bad order " << Found << " for " << Text.c_str() << endl;
        }
        else
        {
            ++Good;
        }
        if (Found >= int(Ranks.size()))
            throw " Using Ranks beyond end";
        if (Found >= 0)
        {
            Ranks[Found].i = Info.mRank;
            Ranks[Found].s = to_string(Good) + ": " + Text;
        }
        // Try to find a word that should not exist
        Text.insert(0, "a");
        Text += '#';
        if (CheckWord(Root, Text) > 0)
            throw string("Found non-existant word ") + Text;
    }
    if (Bad > 0)
        throw "Missing words in reduction check = " + to_string(Bad);
    return Good;
}
// Pair of counters (words matched, errors) which can be summed with +=.
struct ChkNum
{
    int Match = 0;
    int Err = 0;
    ChkNum() { }
    ChkNum(const ChkNum &r) : Match(r.Match), Err(r.Err) { }
    ChkNum & operator = (const ChkNum & r)
    {
        Err = r.Err;
        Match = r.Match;
        return *this;
    }
    ChkNum & operator += (const ChkNum & r)
    {
        Err += r.Err;
        Match += r.Match;
        return *this;
    }
};
/**********************************************************************************
 * Find all possible words in the trie and make sure they are input words. Str is
 * the string spelled out on the way down to Root. Returns the number of word
 * endings found in Entries (Match) and not found (Err) at or below Root. Done as
 * a second trie check.
 */
static ChkNum CheckEntries(NodeSPtr Root, string Str, const EntryMap_t & Entries)
{
    ChkNum Ret;
    if (Root->IsEnd())
    {
        // This is an end node, find the word in the input words
        if (Entries.find(Str) == Entries.end())
            ++Ret.Err;
        else
            ++Ret.Match;
    }
    // Add each child character to the passed string and recursively check
    for(NodeMap_t::iterator Child = Root->ChildBegin(); Child != Root->ChildEnd(); ++Child)
        Ret += CheckEntries(Child->second, Str + Child->first, Entries);
    return Ret;
}
/**********************************************************************************
 * Convert the passed bool array of used chars into a character string. The array
 * is read at indexes 1 to 255; index 0 is never included. The characters appear
 * in ascending code order.
 */
string MakeCharSet(bool *InputCharSet)
{
    string Used;
    int Code = 1;
    while(Code < 256)
    {
        if (InputCharSet[Code])
            Used.push_back(char(Code));
        ++Code;
    }
    return Used;
}
/**********************************************************************************
 * Create a set of strings which contain the possible characters matched at a
 * node when checking a word. Each distinct string of child characters is added
 * to StrSet once, with Loc (then incremented) as its index. Nodes already marked
 * as counted are skipped; every visited node is marked once its children are done.
 */
void MakeChildBitMap(StringIntSet_t & StrSet, NodeSPtr Root, int & Loc)
{
    // Skip if already done
    if (Root->IsCounted())
        return;

    StringInt Key;
    Key.s = Root->GetChildChars();
    if (StrSet.count(Key) == 0)
    {
        // Not already in set of possible child chars for a node, so add it
        Key.i = Loc++;      // Address in the final output array
        StrSet.insert(Key);
    }
    // Recursively do the child nodes
    for(NodeMap_t::iterator Child = Root->ChildBegin(); Child != Root->ChildEnd(); ++Child)
    {
        if (Child->second)
            MakeChildBitMap(StrSet, Child->second, Loc);
    }
    Root->SetCounted();
}
// Layout of the value built for each node, from the least significant bit upwards:
//   child pattern index | child map index | word ending bit | large ending count bit

// Width of the field holding the index of the node's child character pattern in the
// final child bitmap array (this field starts at bit 0)
const int BITS_CHILD_PATT_INDEX = 14;
// Width of the field holding the index in the child map array at which the child
// pointers for the node start
const int BITS_CHILD_MAP_INDEX = 18;
// The child map index sits directly above the child pattern index
const int SHIFT_CHILD_MAP_INDEX = BITS_CHILD_PATT_INDEX;
// Bit set when a word ends at the node
const int SHIFT_WORD_ENDING_BIT = BITS_CHILD_MAP_INDEX + SHIFT_CHILD_MAP_INDEX;
// Bit set when the number of word endings for this + child nodes is >= 256
const int SHIFT_LARGE_ENDING_BIT = 1 + SHIFT_WORD_ENDING_BIT;
/**********************************************************************************
 * Create the arrays of data that will be output, for Root and (recursively) all
 * nodes below it.
 *
 * StrSet      Set of child character patterns with their indexes; must contain
 *             the pattern of every node visited.
 * ChildAddrs  Array of child addresses. The node's list of child addresses is
 *             appended unless the same sequence already occurs in the array.
 * NodeData    Receives, at the node's address, the child pattern index, the
 *             child map index, the word ending bit and the large ending bit.
 * NodeEnds    Receives, at the node's address, the node's number of endings.
 *
 * Both output vectors are grown when the node's address is beyond their end.
 * Throws std::string if the node's pattern is not in StrSet or if an index does
 * not fit in the bits reserved for it.
 */
void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds)
{
    NodeMap_t::iterator Itc;
    StringInt Tmp;
    StringOfInts Chld;

    // Find children in the child pattern array
    Tmp.s = Root->GetChildChars();
    StringIntSet_t::iterator Its = StrSet.find(Tmp);
    if (Its == StrSet.end())
        throw string("Child pattern not found in pattern set for ") + Tmp.s;

    // Make a 'string' of pointers to the children
    for(Itc = Root->ChildBegin(); Itc != Root->ChildEnd(); ++Itc)
    {
        int i = Itc->second->GetAddr();
        Chld += i;
    }
    // Find where in pointer array the child pointer string is
    StringOfInts::size_type x = ChildAddrs.find(Chld);
    if (x == StringOfInts::npos)
    {
        // Not found, add it
        x = ChildAddrs.length();
        ChildAddrs += Chld;
    }

    // Val will contain the final node data
    uint64_t Val = Its->i;
    if (Val >= (uint64_t(1) << BITS_CHILD_PATT_INDEX))
    {
        throw string("Not enough bits for child pattern index value of ") + to_string(Its->i) + " for " +
                Its->s + " (BITS_CHILD_PATT_INDEX too small)";
    }
    if (uint64_t(x) >= (uint64_t(1) << BITS_CHILD_MAP_INDEX))
    {
        // to_string copes with whatever width size_type has on this platform
        throw string("Not enough bits for child map index value of ") + to_string(x) + " for " +
                Its->s + " (BITS_CHILD_MAP_INDEX too small)";
    }
    Val |= uint64_t(x) << SHIFT_CHILD_MAP_INDEX;
    if (Root->IsEnd())
        Val |= uint64_t(1) << SHIFT_WORD_ENDING_BIT;
    if (Root->GetNumEnds() >= 256)
        Val |= uint64_t(1) << SHIFT_LARGE_ENDING_BIT;

    // Make sure output arrays are big enough. An address equal to the current size
    // is already one past the end, so it must trigger a resize too. Each array is
    // checked on its own and is only ever grown.
    const unsigned int Addr = Root->GetAddr();
    if (Addr >= NodeData.size())
        NodeData.resize(Addr+1, 4000000000);
    if (Addr >= NodeEnds.size())
        NodeEnds.resize(Addr+1, 4000000000);

    // Save the node data and number of word endings for the node
    NodeData[Addr] = Val;
    NodeEnds[Addr] = Root->GetNumEnds();

    // Now do the children
    for(Itc = Root->ChildBegin(); Itc != Root->ChildEnd(); ++Itc)
    {
        CreateArrays(Itc->second, StrSet, ChildAddrs, NodeData, NodeEnds);
    }
}
/**********************************************************************************
* Output the data as a binary file.
*/
static int OutputBinary(ostream *Out, const string & ChkFile, const string & CharSet, StringIntSet_t & StrSet, //NodeSPtr & Root,
StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
{
int OutputSize;
unsigned int FewEndStart = 2000000000;
unsigned int i;
unsigned int Index;
unsigned short u;
TrieCheck h;
for(Index = 0; Index < NodeData.size(); ++Index)
{
uint64_t v = NodeData[Index];
if ((FewEndStart >= 2000000000) && !(v & (uint64_t(1) << SHIFT_LARGE_ENDING_BIT)))
{
FewEndStart = Index;
break;
}
}
// Header words
unsigned int NumWordEnd;
const unsigned int MAGIC = 'z' + ('x'<< 8) + ('c' << 16) + ('v' << 24);
Out->write((char *)&MAGIC, sizeof MAGIC); // Write magic
h(&MAGIC, sizeof MAGIC);
OutputSize = sizeof MAGIC;
i = NodeData.size();
Out->write((char *)&i, sizeof i); // Write number of nodes
h(&i, sizeof i);
OutputSize += sizeof i;
i = ChildAddrs.size();
if (NodeData.size() > numeric_limits<unsigned int>::max())
i |= 1<<31;
Out->write((char *)&i, sizeof i); // Write number of child location entries & size of each entry
h(&i, sizeof i);
OutputSize += sizeof i;
i = Ranks.size();
Out->write((char *)&i, sizeof i); // Write number of ranks
h(&i, sizeof i);
OutputSize += sizeof i;
NumWordEnd = (NodeData.size() + 7) / 8;
Out->write((char *)&NumWordEnd, sizeof NumWordEnd); // Write number of word endings
h(&NumWordEnd, sizeof NumWordEnd);
OutputSize += sizeof NumWordEnd;
i = StrSet.size();
Out->write((char *)&i, sizeof i); // Write size of of child bitmap data
h(&i, sizeof i);
OutputSize += sizeof i;
unsigned int BytePerEntry = (CharSet.length() + 7) / 8;
Out->write((char *)&BytePerEntry, sizeof BytePerEntry); // Write size of each child bitmap
h(&BytePerEntry, sizeof BytePerEntry);
OutputSize += sizeof BytePerEntry;
Out->write((char *)&FewEndStart, sizeof FewEndStart); // Write number of large end counts
h(&FewEndStart, sizeof FewEndStart);
OutputSize += sizeof FewEndStart;
i = NodeData.size();
Out->write((char *)&i, sizeof i); // Write number of end counts
h(&i, sizeof i);
OutputSize += sizeof i;
i = CharSet.length();
Out->write((char *)&i, sizeof i); // Write size of character set
h(&i, sizeof i);
OutputSize += sizeof i;
// Output array of node data
unsigned char *WordEnds = new unsigned char[NumWordEnd];
unsigned char v = 0;
unsigned int z = 0;
int y = 0;
for(Index = 0; Index < NodeData.size(); ++Index)
{
i = NodeData[Index];
Out->write((char *)&i, sizeof i);
h(&i, sizeof i);
if (NodeData[Index] & (uint64_t(1) << SHIFT_WORD_ENDING_BIT))
v |= 1 << y;
if (++y >= 8)
{
WordEnds[z++] = v;
y = 0;
v = 0;
}
}
while(z < NumWordEnd)
{
WordEnds[z++] = v;
v = 0;
}
OutputSize += Index * sizeof i;
// Output array of node pointers
for(Index = 0; Index < ChildAddrs.size(); ++Index)
{
i = ChildAddrs[Index];
Out->write((char *)&i, sizeof i);
h(&i, sizeof i);
}
OutputSize += Index * sizeof i;
// Output ranks
for(Index = 0; Index < Ranks.size(); ++Index)
{
i = Ranks[Index].i;
if (i >= (1 << 15))
{
i -= 1 << 15;
i /= 4;
if (i >= (1 << 15))
i = (1 << 15) - 1;
i |= 1 << 15;
}
if (i > numeric_limits<unsigned short>::max())
i = numeric_limits<unsigned short>::max();
u = i;
Out->write((char *)&u, sizeof u);
h(&u, sizeof u);
}
OutputSize += Index * sizeof u;
// Output word end bit markers
Out->write((char *)WordEnds, NumWordEnd);
h(WordEnds, NumWordEnd);
OutputSize += NumWordEnd;
delete WordEnds;
StringIntSet_t::iterator Its;
string Str;
unsigned char Buf[8];
// Get the items from StrSet ordered by the index
StrIntPtrVect_t SetPtrs;
SetPtrs.resize(StrSet.size());
for(Its = StrSet.begin(); Its != StrSet.end(); ++Its)
{
StringInt *p = Its->Self();
if (p->i >= StrSet.size())
throw "Bad index";
SetPtrs[p->i] = p;
}
// Output child bitmap
for(Index = 0; Index < SetPtrs.size(); ++Index)
{
string::size_type z, y;
StringInt *p;
memset(Buf, 0, sizeof Buf);
p = SetPtrs[Index];
Str = p->s;
for(z = 0; z < Str.length(); ++z)
{
y = CharSet.find(Str[z]);
if (y != string::npos)
{
Buf[y/8] |= 1 << (y & 7);
}
}
Out->write((char *)Buf, BytePerEntry);
h(Buf, BytePerEntry);
}
OutputSize += Index * BytePerEntry;
unsigned char c;
for(Index = 0; Index < FewEndStart; ++Index)
{
i = NodeEnds[Index] >> 8;
if (i >= 256)
c = 0;
else
c = i;
Out->write((char *)&c, 1);
h(&c, 1);
}
OutputSize += Index * sizeof c;
for(Index = 0; Index < NodeEnds.size(); ++Index)
{
c = NodeEnds[Index];
Out->write((char *)&c, 1);
h(&c, 1);
}
OutputSize += Index * sizeof c;
Out->write(CharSet.c_str(), CharSet.length());
h(CharSet.c_str(), CharSet.length());
OutputSize += CharSet.length();
if (!ChkFile.empty())
{
// Write the checksum file
TrieCheck::Check_t x = h.Result();
ofstream f(ChkFile);
f << "static const unsigned char WordCheck[" << sizeof x << "] =\n{\n ";
unsigned char *c = reinterpret_cast<unsigned char *>(&x);
for(Index = 0; Index < sizeof x; ++Index, ++c)
{
if (Index)
f << ',';
f << int(*c);
}
f << "\n};\n";
f << "#define WORD_FILE_SIZE " << OutputSize << endl;
f << "#define ROOT_NODE_LOC 0\n"
"#define BITS_CHILD_PATT_INDEX " << BITS_CHILD_PATT_INDEX << "\n"
"#define BITS_CHILD_MAP_INDEX " << BITS_CHILD_MAP_INDEX << "\n"
"#define SHIFT_CHILD_MAP_INDEX BITS_CHILD_PATT_INDEX\n"
"#define SHIFT_WORD_ENDING_BIT (SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX)" << endl;
f.close();
}
return OutputSize;
}
/**********************************************************************************
 * Output one text line per entry of Ranks, starting at index 1: the word (the
 * entry's string with everything up to and including the first ':' removed)
 * followed by a space, padding spaces up to 16 characters, the base 2 logarithm
 * of the rank and the rank itself. The bool parameter is unused.
 * Returns the index at which the loop stopped.
 */
int OutputTester(ostream *Out, bool /*Cmnts*/, StringIntVect_t & Ranks)
{
    unsigned int Index = 1;
    while(Index < Ranks.size())
    {
        const unsigned int Rank = Ranks[Index].i;
        string Pwd = Ranks[Index].s;
        // Drop the "count:" prefix if there is one
        const string::size_type Colon = Pwd.find(':');
        if (Colon != string::npos)
            Pwd.erase(0, Colon + 1);
        *Out << Pwd.c_str() << " ";
        string::size_type Col = Pwd.length();
        while(Col < 16)
        {
            *Out << ' ';
            ++Col;
        }
        *Out << log(Rank*1.0) / log(2.0) << " " << Rank << '\n';
        ++Index;
    }
    return Index;
}
// Column budget for the generated C source: OutputCode() adds an estimated width
// for every value it writes and starts a new line once the total exceeds this.
const int LINE_OUT_LEN = 160;
/**********************************************************************************
* Output the data as C source.
*/
int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t & StrSet, NodeSPtr & Root,
               StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
{
    // Writes the dictionary tables to *Out as C source text, in this order: the DictNodes,
    // WordEndBits, ChildLocs, Ranks, ChildMap, EndCountLge, EndCountSml and CharSet arrays,
    // plus the #defines describing the node bit layout.
    //   Out        Stream receiving the generated C text.
    //   Cmnts      When true, nodes are written using the ND() macro and the rank and child
    //              map entries are annotated with trailing comments.
    //   CharSet    The characters used by the dictionary; one bit per character in ChildMap.
    //   StrSet     The child character patterns; each entry's index gives its ChildMap row.
    //   Root       Root of the trie; only its address is written (ROOT_NODE_LOC).
    //   ChildAddrs Values written to the ChildLocs array.
    //   NodeData   Packed per-node values (DictNodes plus the word ending flags).
    //   NodeEnds   Per-node word ending counts, split into EndCountLge/EndCountSml.
    //   Ranks      Word ranks (and the words themselves, used for the comments).
    // Returns the summed byte size of the emitted arrays plus sizeof(unsigned int).
    // Throws a const char * message if a StrSet entry has an out of range index.
    unsigned int Index;
    int OutputSize;

    if (Cmnts)
        *Out << "#define ND(e,c,b) (c<<" << SHIFT_CHILD_MAP_INDEX << ")|b\n";

    // Output array of node data
    *Out << "#define ROOT_NODE_LOC 0\n"
            "#define BITS_CHILD_PATT_INDEX " << BITS_CHILD_PATT_INDEX << "\n"
            "#define BITS_CHILD_MAP_INDEX " << BITS_CHILD_MAP_INDEX << "\n"
            "#define SHIFT_CHILD_MAP_INDEX BITS_CHILD_PATT_INDEX\n"
            "#define SHIFT_WORD_ENDING_BIT (SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX)\n"
            "static const unsigned int DictNodes[" << NodeData.size() << "] =\n{";
    OutputSize = NodeData.size() * sizeof(unsigned int);

    // x tracks the (approximate) width of the current output line; starting above
    // LINE_OUT_LEN forces a line break before the first value.
    int x = 999;
    // Index of the first node without the large ending count flag. The initial value is a
    // "not found yet" marker.
    unsigned int FewEndStart = 2000000000;
    for(Index = 0; Index < NodeData.size(); ++Index)
    {
        uint64_t v;
        x += 11;
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n ";
            x=0;
        }
        // Only the bits below the word ending flag are stored in DictNodes
        v = NodeData[Index];
        v &= (uint64_t(1) << SHIFT_WORD_ENDING_BIT) - 1;
        if (Cmnts)
        {
            uint64_t i;
            // The ending flags must be read from the unmasked node value (they have been
            // removed from v). The ND() macro ignores this argument, it is informational only.
            i = (NodeData[Index] >> SHIFT_WORD_ENDING_BIT) & 3;
            *Out << "ND(" << i << ',';
            i= (v >> SHIFT_CHILD_MAP_INDEX) & ((1<<BITS_CHILD_MAP_INDEX)-1);
            *Out << i << ',';
            if (i < 10000) *Out << ' ';
            if (i < 1000) *Out << ' ';
            if (i < 100) *Out << ' ';
            if (i < 10) *Out << ' ';
            i = v & ((1<<BITS_CHILD_PATT_INDEX)-1);
            *Out << i << ")";
            if (Index < (NodeData.size()-1))
            {
                *Out << ',';
                if (i < 1000) *Out << ' ';
                if (i < 100) *Out << ' ';
                if (i < 10) *Out << ' ';
            }
        }
        else
        {
            *Out << v;
            if (Index < (NodeData.size()-1))
            {
                *Out << ',';
                if (v < 1000000000) *Out << ' ';
                if (v < 100000000) *Out << ' ';
                if (v < 10000000) *Out << ' ';
                if (v < 1000000) *Out << ' ';
                if (v < 100000) *Out << ' ';
                if (v < 10000) *Out << ' ';
                if (v < 1000) *Out << ' ';
                if (v < 100) *Out << ' ';
                if (v < 10) *Out << ' ';
            }
        }
        if ((FewEndStart >= 2000000000) && !(NodeData[Index] & (uint64_t(1) << SHIFT_LARGE_ENDING_BIT)))
            FewEndStart = Index;
    }
    // If no node had the large ending flag clear (or there are no nodes) the marker value is
    // still in place. Treat every node as "large" rather than using the marker as an array
    // size and reading far beyond the end of NodeEnds below.
    if (FewEndStart > NodeData.size())
        FewEndStart = static_cast<unsigned int>(NodeData.size());
    *Out << "\n};\n";

    // Output the word ending flags, packed 8 nodes per byte (lowest bit is the first node)
    unsigned int Len = ((NodeData.size() + 7) / 8);
    OutputSize += Len;
    x = 999;
    *Out << "static unsigned char WordEndBits[" << Len << "] =\n{";
    Index = 0;
    unsigned int v = 0;     // Byte being assembled
    unsigned int y = 0;     // Bit position within the byte
    unsigned int z = 0;     // Number of bytes written
    while(z < Len)
    {
        // Indexes past the last node just pad the final byte with zero bits
        if (Index < NodeData.size())
        {
            if (NodeData[Index] & (uint64_t(1) << SHIFT_WORD_ENDING_BIT))
                v |= 1 << y;
        }
        if (++y >= 8)
        {
            x += 4;
            if (x > LINE_OUT_LEN)
            {
                *Out << "\n ";
                x = 0;
            }
            *Out << v;
            if (++z < Len)
            {
                *Out << ',';
                if (v < 100) *Out << ' ';
                if (v < 10) *Out << ' ';
            }
            y = 0;
            v = 0;
        }
        ++Index;
    }
    *Out << "\n};\n";

    // Output array of node pointers. The element type is chosen from the number of nodes.
    *Out << "static const unsigned ";
    if (NodeData.size() > numeric_limits<unsigned short>::max())
    {
        *Out << "int";
        x = sizeof(unsigned int);
    }
    else
    {
        *Out << "short";
        x = sizeof(unsigned short);
    }
    *Out << " ChildLocs[" << ChildAddrs.size() << "] =\n{";
    OutputSize += x * ChildAddrs.size();
    x = 999;
    for(Index = 0; Index < ChildAddrs.size(); ++Index)
    {
        int v;
        x += 6;
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n ";
            x=0;
        }
        v = ChildAddrs[Index];
        *Out << v;
        if (Index < (ChildAddrs.size()-1))
        {
            *Out << ',';
            if (v < 10000) *Out << ' ';
            if (v < 1000) *Out << ' ';
            if (v < 100) *Out << ' ';
            if (v < 10) *Out << ' ';
        }
    }
    *Out << "\n};\n";

    // Output the rank of the words. With comments there is one rank per line followed by the
    // word, otherwise the values are packed onto lines like the other arrays.
    *Out << "static const unsigned short Ranks[" << Ranks.size() << "] =\n{";
    OutputSize += Ranks.size() * sizeof(unsigned short);
    x = 999;
    bool TooBig = false;
    if (Cmnts)
        *Out << "\n";
    for(Index = 0; Index < Ranks.size(); ++Index)
    {
        unsigned int v;
        if (Cmnts)
        {
            *Out << " ";
        }
        else
        {
            x += 6;
            if (x > LINE_OUT_LEN)
            {
                *Out << "\n ";
                x=0;
            }
        }
        // Ranks of 2^15 and above are stored as (rank - 2^15) / 4 with the top bit set, so
        // they still fit in 16 bits. Anything too large for that is limited to the maximum.
        v = Ranks[Index].i;
        if (v >= (1 << 15))
        {
            v -= 1 << 15;
            v /= 4;
            if (v >= (1 << 15))
            {
                TooBig = true;
                v = (1 << 15) - 1;
            }
            v |= 1 << 15;
        }
        if (v > numeric_limits<unsigned short>::max())
            v = numeric_limits<unsigned short>::max();
        *Out << v;
        if (Index < (Ranks.size()-1))
        {
            *Out << ',';
            if (v < 10000) *Out << ' ';
            if (v < 1000) *Out << ' ';
            if (v < 100) *Out << ' ';
            if (v < 10) *Out << ' ';
        }
        if (Cmnts)
            *Out << " // " << Ranks[Index].s.c_str() << '\n';
    }
    *Out << "\n};\n";
    if (TooBig)
    {
        // Note this message is written to cout, which may not be the same stream as *Out
        unsigned int v = ((1<<15) - 1) * 4 + (1<<15);
        cout << "// Word ranks too large, value restricted to " << v << endl;
    }

    // Output the child character bitmaps, one row of BytePerEntry bytes for each pattern
    unsigned int BytePerEntry = (CharSet.length() + 7) / 8;
    *Out << "#define SizeChildMapEntry " << BytePerEntry << '\n';
    *Out << "static const unsigned char ChildMap[" << StrSet.size() << '*' << BytePerEntry << "] =\n{";
    OutputSize += StrSet.size() * BytePerEntry * sizeof(unsigned char);
    StringIntSet_t::iterator Its;
    string Str;
    // The row buffer is sized from the character set, so sets of more than 64 characters
    // can not write beyond the end of it.
    vector<unsigned char> Buf(BytePerEntry);

    // Get the items from StrSet ordered by the index
    StrIntPtrVect_t SetPtrs;
    SetPtrs.resize(StrSet.size());
    for(Its = StrSet.begin(); Its != StrSet.end(); ++Its)
    {
        StringInt *p = Its->Self();
        if (p->i >= StrSet.size())
        {
            cout << "p->i=" << p->i << " " << p->s.c_str() << endl;
            throw "Bad index";
        }
        SetPtrs[p->i] = p;
    }
    x = 999;
    for(Index = 0; Index < SetPtrs.size(); ++Index)
    {
        string::size_type z, y;
        StringInt *p;
        Buf.assign(BytePerEntry, 0);
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n ";
            x = 4*BytePerEntry;
        }
        p = SetPtrs[Index];
        Str = p->s;
        // Set the bit for each character of the pattern which is in the character set
        for(z = 0; z < Str.length(); ++z)
        {
            y = CharSet.find(Str[z]);
            if (y != string::npos)
            {
                Buf[y/8] |= 1 << (y & 7);
            }
        }
        for(z = 0; z < BytePerEntry; ++z)
        {
            y = Buf[z] & 0xFF;
            *Out << y;
            if (z < (BytePerEntry-1))
                *Out << ',';
            else
            {
                if (Index < (SetPtrs.size() - 1))
                    *Out << ", ";
            }
            if (y < 100)
                *Out << ' ';
            if (y < 10)
                *Out << ' ';
            x += 4;
        }
        if (Cmnts)
        {
            // Force a new line after each commented row
            *Out << " // " << p->i << ": " << Str;
            x = 999;
        }
    }
    *Out << "\n};" << endl;

    // Output the top 8 bits of the node word endings count. Since nodes with >255 endings have
    // been placed at the beginning, and there are not too many of them, the array is fairly small.
    *Out << "#define NumLargeCounts " << FewEndStart << "\n";
    *Out << "static const unsigned char EndCountLge[" << FewEndStart << "] =\n{";
    OutputSize += FewEndStart * sizeof(unsigned char);
    x = 999;
    for(Index = 0; Index < FewEndStart; ++Index)
    {
        unsigned int v;
        x += 4;
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n ";
            x=0;
        }
        v = NodeEnds[Index] >> 8;
        if (v >= 256)
            v = 0;
        *Out << v;
        if (Index < (FewEndStart-1))
        {
            *Out << ',';
            if (v < 100) *Out << ' ';
            if (v < 10) *Out << ' ';
        }
    }
    *Out << "\n};\n";

    // Output the lower 8 bits of every node's word ending count. Together with the upper
    // bits written above for the first few nodes, this lets byte arrays be used for the
    // counts, so saving memory.
    *Out << "static const unsigned char EndCountSml[" << NodeEnds.size() << "] =\n{";
    OutputSize += NodeEnds.size() * sizeof(unsigned char);
    x = 999;
    for(Index = 0; Index < NodeEnds.size(); ++Index)
    {
        unsigned int v;
        x += 4;
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n ";
            x=0;
        }
        v = NodeEnds[Index] & 255;
        *Out << v;
        if (Index < (NodeEnds.size()-1))
        {
            *Out << ',';
            if (v < 100) *Out << ' ';
            if (v < 10) *Out << ' ';
        }
    }
    *Out << "\n};\n";

    // Finally output the used characters, escaping those which are special in a C string.
    *Out << "static const char CharSet[" << CharSet.length()+1 << "] = \"";
    OutputSize += CharSet.length() * sizeof(char);
    for(Index = 0; Index < CharSet.length(); ++Index)
    {
        char c = CharSet[Index];
        if ((c == '\\') || (c == '"'))
            *Out << '\\';
        *Out << c;
    }
    *Out << "\";" << endl;
    // NOTE(review): ROOT_NODE_LOC was already defined as 0 at the top of the output, so the
    // generated file defines the macro twice. This is only a clean redefinition when the root
    // address is 0 - confirm against the consumer of the generated file before changing.
    *Out << "#define ROOT_NODE_LOC " << Root->GetAddr() << "\n";
    return OutputSize + sizeof(unsigned int);
}
// Output formats; main() stores one of these in OutType (OUT_C_CODE is the default,
// -b selects OUT_BINARY and -t selects OUT_TESTER).
enum { OUT_C_CODE, OUT_BINARY, OUT_TESTER };
/**********************************************************************************
*/
int main(int argc, char *argv[])
{
int i;
int MaxRank = 999999999;
int OutType = OUT_C_CODE;
bool Verbose = false;
bool Comments = false;
string FileName, HashFile;
char *OutFile = 0;
EntryMap_t Entries;
FileInfo InInfo[10];
int NumFiles = 0;
MinLength = 999;
try
{
for(i = 1; i < argc; ++i)
{
FileName = argv[i];
if (FileName == "-b")
{
// Output a binary file to stdout or file
OutType = OUT_BINARY;
continue;
}
if (FileName == "-t")
{
// Output a tester file to stdout or file
OutType = OUT_TESTER;
continue;
}
if (FileName == "-c")
{
// Add comments to the output (if text)
Comments = true;
continue;
}
if (FileName == "-o")
{
// Give output file
if (++i < argc)
OutFile = argv[i];
continue;
}
if (FileName == "-h")
{
// Give crc header output file
if (++i < argc)
HashFile = argv[i];
continue;
}
if (FileName == "-r")
{
// Ignore words with too high rank
if (++i < argc)
{
char *p=0;
MaxRank = strtol(argv[i], &p, 0);
if ((MaxRank < 1000) || *p)
MaxRank = 999999999;
continue;
}
}
if (FileName == "-v")
{
Verbose = true;
continue;
}
if (FileName[0] == '-')
{
cerr << "Usage: " << argv[0] << " [ -c ] [ -b | -t ] [ -o Ofile ] [ -h Hfile ] Files...\n"
"Where:\n"
" -b Generate a binary output file\n"
" -t Generate a test file for testing zxcvbn\n"
" -c Add comments to output file if C code mode\n"
" -r number Ignore words with rank greater than number (must be >=1000)\n"
" -v Additional information output\n"
" -h Hfile Write file checksum to file Hfile as C code (for -b mode)\n"
" -o Ofile Write output to file Ofile\n"
" Files The dictionary input files to read\n"
" If the -o option is not used, output is written to stdout\n"
" if the -b option is not used, output is in the form of C source code\n"
<< endl;
return 1;
}
ReadInputFile(FileName, InInfo[NumFiles], MaxRank);
if (NumFiles < int(sizeof InInfo / sizeof InInfo[0] - 1))
++NumFiles;
}
CombineWordLists(Entries, InInfo, NumFiles);
if (Verbose)
{
if (!OutFile && (OutType == OUT_C_CODE))
cout << "/*\n";
for(i = 0; i < NumFiles; ++i)
{
FileInfo *Fi = InInfo + i;
cout << "Read input file " << Fi->Name << endl;
cout << " Input words " << Fi->Words << endl;
cout << " Used words " << Fi->Used << endl;
cout << " Unused " << Fi->BruteIgnore <<
" Bruteforce compare, " << Fi->Accented <<
" Accented char, " << Fi->Dups << " Duplicates" << endl;
}
}
bool InputCharSet[256];
NodeSPtr Root(new Node);
// Initially charset of used chracters is empty
memset(InputCharSet, 0, sizeof InputCharSet);
// Add words to the trie with root in Root
ProcessEntries(Root, Entries, InputCharSet);
// Get some interesting info
int NumEnds = Root->CalcEndings();
int Hi = Root->CalcHeight();
int NumNodes = Root->NodeCount();
if (Verbose)
{
cout << "Max word length = " << MaxLength << endl;
cout << "Min word length = " << MinLength << endl;
cout << "Num input chars = " << NumChars << endl;
cout << "Num input words = " << NumInWords << endl;
cout << "Duplicate words = " << NumDuplicate;
cout << "Number of Ends = " << NumEnds << endl;
cout << "Number of Nodes = " << NumNodes << endl;
cout << "Trie height = " << Hi << endl;
}
// Store the alphabetical ordering of the input words
i = 0;
ScanTrieForOrder(Entries, i, Root, string());
if (Verbose)
cout << "Trie Order = " << i << endl;
int InputOrder = i;
// Reduce the Trie
ReduceTrie(Root);
// Output some interesting information
NumNodes = Root->NodeCount();
int ReduceEnds = Root->CalcEndings();
if (Verbose)
{
cout << "After reduce:\n";
cout << "Number of Ends = " << ReduceEnds << endl;
cout << "Number of Nodes = " << NumNodes << endl;
}
// Check reduction was OK
StringIntVect_t Ranks;
int CheckEnds = CheckReduction(Ranks, Root, Entries);
if (Verbose)
cout << "Number of Words = " << CheckEnds << endl;
ChkNum Tst = CheckEntries(Root, string(), Entries);
if (Verbose)
{
cout << "2nd check - Number of valid words = " << Tst.Match << endl;
cout << " Number of invalid words = " << Tst.Err << endl;
}
// Give up if there was an error
if (Tst.Err)
throw "Checks show invalid words after reduction";
if ((Tst.Match != InputOrder) || (ReduceEnds != InputOrder))
throw "Word count changed after reduce";
// Output more info
StringIntSet_t ChildBits;
string CharSet = MakeCharSet(InputCharSet);
if (Verbose)
cout << "Used characters (" << CharSet.length() << "): " << CharSet.c_str() << endl;
// Make a set of all unique child character patterns for the nodes
i=0;
Root->ClearCounted();
MakeChildBitMap(ChildBits, Root, i);
if (Verbose)
cout << "Number of child bitmaps = " << ChildBits.size() << endl;
// Get final node address
Root->CalcAddress();
Uint64Vect NodeData;
UintVect NodeEnds;
StringOfInts ChildAddrs;
// Resize to save library adjusting allocation during data creation
NodeData.resize(NumNodes, 4000000000);
NodeEnds.resize(NumNodes, 4000000000);
CreateArrays(Root, ChildBits, ChildAddrs, NodeData, NodeEnds);
if (Verbose)
{
cout << "Node data array size " << NodeData.size() << endl;
cout << "Child pointer array size " << ChildAddrs.size() << endl;
}
shared_ptr<ofstream> fout;
ostream *Out = &cout;
if (OutFile)
{
fout = shared_ptr<ofstream>(new ofstream);
if (OutType == OUT_BINARY)
fout->open(OutFile, ios_base::trunc | ios_base::binary);
else
fout->open(OutFile, ios_base::trunc);
Out = fout.get();
}
if (!OutFile && (OutType == OUT_C_CODE))
cout << "*/\n";
if (OutType == OUT_BINARY)
i = OutputBinary(Out, HashFile, CharSet, ChildBits, ChildAddrs, NodeData, NodeEnds, Ranks);
else if (OutType == OUT_TESTER)
i = OutputTester(Out, Comments, Ranks);
else
i = OutputCode(Out, Comments, CharSet, ChildBits, Root, ChildAddrs, NodeData, NodeEnds, Ranks);
if (fout)
{
fout->close();
}
}
catch(const char *m)
{
cerr << m << endl;
return 1;
}
catch(string m)
{
cerr << m.c_str() << endl;
return 1;
}
catch(...)
{
cerr << "Unhandled exception" << endl;
return 1;
}
return 0;
}
/**********************************************************************************/