/***** * symbol.cc * Andy Hammerlindl 2002/06/18 * * Creates symbols from strings so that multiple calls for a symbol of * the same string will return an identical object. *****/ #include #include using std::strlen; #include "settings.h" #include "symbol.h" namespace sym { const char USED = 1; const char SKIP = 2; struct symbolRecord { // When a symbol is entered into the table, its hash is computed. If the // corresponding entry in the table is full, this value is incremented until // an empty slot is found. hashplus stores the end value. // Each symbol has a unique hashplus value, even if there is a collision in // the original hashing function. uint hashplus; // Whether the cell of the table is empty, in use, or a "skip" entry due to // a resizing of the table. unsigned char flag; // Pointer to a copy of the string (allocated on the heap). This string // will never be deallocated. Symbols, in essence, last forever. char *s; }; // The table size must be a power of two so that (h % tableSize) can be // replaced by (h & tableMask). 1 << 15 was chosen based on the number of // unique symbols (roughly 4000) which occured in all of the base modules. const size_t SYMBOL_TABLE_BASE_CAPACITY = 1 << 15; symbolRecord baseSymbolTable[SYMBOL_TABLE_BASE_CAPACITY]; symbolRecord *table = baseSymbolTable; size_t tableCapacity = SYMBOL_TABLE_BASE_CAPACITY; uint tableMask = 0; size_t tableSize = 0; symbolRecord &recordByHashplus(uint h) { return table[h & tableMask]; } GCInit symbol::initialize; symbol symbol::nullsym; symbol symbol::initsym; symbol symbol::castsym; symbol symbol::ecastsym; const char *nullsymstr = ""; void initTable() { tableMask = (uint)(tableCapacity - 1); tableSize = 0; // Set every entry to empty. (Is this faster than memsetting the whole // thing?) for (size_t i = 0; i < tableCapacity; ++i) table[i].flag = 0; // The zeroth entry is reserved for the "null" symbol. if (table == baseSymbolTable) { table[0].flag = USED; table[0].s = new char[strlen(nullsymstr) + 1]; strcpy(table[0].s, nullsymstr); ++tableSize; symbol::nullsym.hashplus = 0; symbol::initsym = symbol::opTrans("init"); symbol::castsym = symbol::opTrans("cast"); symbol::ecastsym = symbol::opTrans("ecast"); } } // Hashing constants found experimentally to reduce collision (a little). const uint A = 25191, B = 16342, C = 1746, D = 18326; // Hash the string into an integer. Experimental testing has shown that // hashing only the first few letters seems to be faster than hashing deeper // into the string, even though this approach causes more hash collisions. uint hash(const char *s, size_t len) { uint h = s[0]; if (len == 2) return h; h += A*s[1]; if (len == 3) return h; h += B*s[2]; if (len == 4) return h; h += C*s[3]; if (len == 5) return h; h += D*s[4]; return h+len; } /* Under normal circumstances, the initial table should be large enough for * all of the symbols used and will never be resized. Just in case the * program encounters a large number of distinct symbols, we implement * resizing of the table. */ void resizeTable() { symbolRecord *oldTable = table; size_t oldSize = tableSize; size_t oldCapacity = tableCapacity; tableCapacity *= 4; table = new symbolRecord[tableCapacity]; initTable(); // The null symbol is a special case. table[0] = oldTable[0]; ++tableSize; #if 0 printf("old:\n"); for (size_t i = 0; i < oldCapacity; ++i) { symbolRecord &r = oldTable[i]; if (r.flag != USED) continue; printf(" %u -> %s\n", r.hashplus, r.s); } #endif for (size_t i = 1; i < oldCapacity; ++i) { symbolRecord &r = oldTable[i]; if (r.flag != USED) continue; // Entries that were skipped over when this symbol was entered into the // old hash table may not appear in the same spot in the new hash table. // Put "SKIP" entries in their place, so that the symbol will still be // found. for (uint h = hash(r.s, strlen(r.s)+1); h < r.hashplus; ++h) { symbolRecord &skipr = recordByHashplus(h); if (skipr.flag == 0) skipr.flag = SKIP; } // Enter the symbol in its spot. symbolRecord &newr = recordByHashplus(r.hashplus); assert(newr.flag != USED); newr.flag = USED; newr.hashplus = r.hashplus; newr.s = r.s; ++tableSize; } #if 0 printf("new:\n"); for (size_t i = 0; i < tableCapacity; ++i) { symbolRecord &r = table[i]; if (r.flag != USED) continue; printf(" %u -> %s\n", r.hashplus, r.s); } #endif assert(tableSize == oldSize); // Debugging resize. for (size_t i = 1; i < oldCapacity; ++i) { symbolRecord &r = oldTable[i]; if (r.flag != USED) continue; symbolRecord &newr = recordByHashplus(r.hashplus); assert(newr.hashplus == r.hashplus); assert(newr.flag != 0); assert(newr.flag != SKIP); assert(newr.flag == USED); assert(newr.s = r.s); if (strncmp(r.s, "gensym", 6) != 0) assert(symbol::rawTrans(r.s, strlen(r.s)+1).hashplus == r.hashplus); } #if 0 // Diagnostics. uint empty=0, used=0, skip=0; for (size_t i = 0; i < tableCapacity; ++i) { symbolRecord &r = table[i]; if (r.flag == 0) ++empty; else if (r.flag == USED) ++used; else if (r.flag == SKIP) ++skip; else assert("Unknown flag" == 0); } cout << "Resized symbol table. " << "empty: " << empty << "used: " << used << "skip: " << skip << endl; #endif } symbol symbolize(uint h) { symbol s; s.hashplus = h; return s; } // Handles the insertion of a new symbol into a table the has been resized (or // needs resizing). symbol advancedInsert(const char *s, size_t len) { if (2*tableSize >= tableCapacity) resizeTable(); uint hashplus = hash(s, len); #if 1 assert(s != 0); assert(len > 0); assert(2*tableSize <= tableCapacity); #endif // We know the symbol is not in the table. Just search for the first unused // entry (either empty or a skip entry) and insert there. for (;;) { symbolRecord &r = recordByHashplus(hashplus); if (r.flag != USED) { r.flag = USED; r.s = new char[len]; memcpy(r.s, s, len); assert(r.s[len-1] == '\0'); r.hashplus = hashplus; ++tableSize; assert(2*tableSize <= tableCapacity); return symbolize(hashplus); } ++hashplus; } assert("Unreachable code" == 0); return symbol::nullsym; } symbol symbol::gensym(string s) { // Gensym can be inserted as if it were a normal string not already in the // table. advancedInsert handles this. s = "gensym " + s; return advancedInsert(s.c_str(), s.size() + 1); } symbol symbol::rawTrans(const char *s, size_t len) { uint hashplus = sym::hash(s, len); #if 1 assert(s != 0); assert(len > 0); assert(2*tableSize <= tableCapacity); #endif // Search through the table till we find the symbol already translated or // an empty field. for (;;) { symbolRecord &r = recordByHashplus(hashplus); // Translating pre-existing symbols is more common, so check for it first. if (r.hashplus == hashplus && r.flag == USED && strncmp(r.s, s, len) == 0) { return symbolize(hashplus); } // Then check for an empty entry, in which case the entry is added. if (r.flag == 0) { // Test if the table needs resizing before entering a new symbol, or if // the table has already been resized. In either case, the symbol will // be added to a resized table which may contain skip entries, and a // more involved insertion routine is needed. if (2*tableSize >= SYMBOL_TABLE_BASE_CAPACITY) return advancedInsert(s, len); r.flag = USED; r.s = new char[len]; memcpy(r.s, s, len); assert(r.s[len-1] == '\0'); r.hashplus = hashplus; ++tableSize; assert(2*tableSize <= tableCapacity); return symbolize(hashplus); } // A case where a different symbol is in the spot, continue along the // table. ++hashplus; } assert("Unreachable code" == 0); return symbol::nullsym; } symbol::operator string () const { symbolRecord &r = recordByHashplus(this->hashplus); return (string)r.s; } #ifdef USEGC symbol::operator std::string () const { symbolRecord &r = recordByHashplus(this->hashplus); return (std::string)r.s; } #endif ostream& operator<< (ostream& out, const symbol sym) { symbolRecord &r = recordByHashplus(sym.hashplus); return out << r.s; } } // end namespace sym /* Define all of operator symbols SYM_PLUS, etc. */ #define OPSYMBOL(str, name) \ sym::symbol name = sym::symbol::opTrans(str) #include "opsymbols.h" #undef OPSYMBOL /* Define all of the symbols of the type SYM(name) in selected files. */ #define ADDSYMBOL(name) \ sym::symbol PRETRANSLATED_SYMBOL_##name = sym::symbol::literalTrans(#name) #include "allsymbols.h" #undef ADDSYMBOL