Another old (and disappointing) project was to quickly compress strings a bit, by not using the complete ASCII range and certainly not the whole Unicode range.
At first I used this class to save a form's type name as an identifier, using 6 bits per letter. Besides being a bit gratifying in completing the puzzle, the usefulness of this class is... well... let's say... not so very great. Right now I'm using it for a quick serializer, where using it on cumulative type names can make some difference, but probably not much in comparison to real compression algorithms such as zipping.
Why post it then? Good question. Mainly because I want to be able to find it again when doing such a 'puzzle' once more. And who knows, maybe someday this can be built further into a better sorting algorithm; and partly because some of you out there might actually have a use for the bitwise storing used in this class (who knows eh? :D )
namespace Subro.Text
{
using System;
using System.Collections;
/// <summary>
/// Class to create a more compact version of a string (and back).
/// Not the same as a compression algorithm, but a quick way to
/// store a string minimized: every character is replaced by its 1-based
/// position in an "allowed characters" string and those positions are
/// packed back to back using <see cref="BitGroup"/> bits each.
/// Position 0 is reserved as the "end of text" marker, which is why unused
/// (zero) bytes at the end of the data are ignored when decompacting.
/// </summary>
/// <example>
/// Subro.Text.Compactor c = new Subro.Text.CompactorAscii();
/// string t = "This is a test, does it compute?";
/// byte[] b;
/// b = c.Compact(t);
/// t = c.DeCompact(b);
///
/// Type ty = typeof(Form1);
/// c = new Subro.Text.CompactorTypeName();
/// b = c.Compact(ty.FullName);
/// ty = (c as Subro.Text.CompactorTypeName).GetType(b);
/// </example>
public class Compactor
{
    // The alphabet: a character's code is its index in this string plus one.
    readonly string allow;

    // Number of bits every stored character occupies.
    readonly int bitgroup;

    // Built once; the property below used to rebuild it on every access.
    static readonly string normalAscii = BuildNormalAscii();

    /// <summary>
    /// Creates a compactor for the given set of allowed characters.
    /// </summary>
    /// <param name="AllowedValues">
    /// The characters that may be stored. The bit width per character is the
    /// smallest number of bits that can hold the values 0 to AllowedValues.Length.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="AllowedValues"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="AllowedValues"/> is empty.</exception>
    public Compactor(string AllowedValues)
    {
        if (AllowedValues == null)
        {
            throw new ArgumentNullException("AllowedValues");
        }
        if (AllowedValues.Length == 0)
        {
            // A zero-bit alphabet cannot store anything and would make
            // GetLength(byte[]) divide by zero.
            throw new ArgumentException("At least one allowed character is required.", "AllowedValues");
        }

        allow = AllowedValues;
        bitgroup = BitsFor(AllowedValues.Length);
    }

    /// <summary>
    /// Uses the default allowed values: <see cref="CompactStringBase"/>.
    /// <see cref="ThrowOnInvalidChars"/> is set to false, so other characters
    /// are simply ignored.
    /// </summary>
    public Compactor()
        : this(CompactStringBase)
    {
        ThrowOnInvalidChars = false;
    }

    /// <summary>
    /// Compacts a string into the smallest byte array that can hold all of
    /// its characters (see <see cref="GetLength(string)"/>).
    /// </summary>
    /// <param name="value">The text to compact.</param>
    /// <returns>The packed character codes.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// A character is not in the allowed set and <see cref="ThrowOnInvalidChars"/> is true.
    /// </exception>
    public byte[] Compact(string value)
    {
        if (value == null)
        {
            throw new ArgumentNullException("value");
        }
        return Compact(value, GetLength(value));
    }

    /// <summary>
    /// Compacts a string into a byte array of exactly <paramref name="FixedLength"/> bytes.
    /// When the text needs fewer bytes the remainder stays zero; when it needs
    /// more, packing stops as soon as the array is full (the last character may
    /// be stored only partially, and later characters are not examined).
    /// </summary>
    /// <param name="value">The text to compact.</param>
    /// <param name="FixedLength">Size of the returned array in bytes.</param>
    /// <returns>The packed character codes, least significant bit first.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// A character is not in the allowed set and <see cref="ThrowOnInvalidChars"/> is true.
    /// </exception>
    public byte[] Compact(string value, uint FixedLength)
    {
        if (value == null)
        {
            throw new ArgumentNullException("value");
        }

        byte[] b = new byte[FixedLength];
        if (FixedLength == 0)
        {
            // Nothing can be stored; avoids indexing into an empty array.
            return b;
        }

        int pos = 0;    // byte currently being filled
        int bitpos = 0; // bits already used in b[pos] (always 0..7 here)

        foreach (char ch in value)
        {
            int code = allow.IndexOf(ch) + 1; // 0 means "not allowed"
            if (code == 0)
            {
                if (ThrowOnInvalidChars)
                {
                    throw new ArgumentException("Character is not allowed with the current settings: " + ch.ToString());
                }
                continue;
            }

            // Line the code up with the first free bit, then emit it one byte
            // at a time. A code may span several bytes when BitGroup > 8.
            int pending = code << bitpos;
            int bits = bitpos + bitgroup;
            while (true)
            {
                // Truncation to the low 8 bits is intended, even in a checked build.
                b[pos] |= unchecked((byte)pending);
                if (bits < 8)
                {
                    break; // current byte still has room for the next character
                }

                bits -= 8;
                pending >>= 8;
                if (++pos == FixedLength)
                {
                    return b; // array is full: drop whatever is left
                }
            }
            bitpos = bits;
        }

        return b;
    }

    /// <summary>
    /// The amount of bits one character will take, using the allowed string.
    /// </summary>
    public int BitGroup
    {
        get { return bitgroup; }
    }

    /// <summary>
    /// The mask that can be used with an AND against data to get the
    /// right-most (lowest) letter index.
    /// </summary>
    public int Mask
    {
        get { return ((1 << bitgroup) - 1); }
    }

    /// <summary>
    /// Returns the expected length of the byte array returned when compacting
    /// a string with the currently allowed characters.
    /// This is NOT subtracting invalid characters when <see cref="ThrowOnInvalidChars"/>
    /// is disabled.
    /// </summary>
    /// <param name="Value">The text that would be compacted.</param>
    /// <returns>Number of bytes: the total bit count rounded up to whole bytes.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="Value"/> is null.</exception>
    public uint GetLength(string Value)
    {
        if (Value == null)
        {
            throw new ArgumentNullException("Value");
        }
        // Integer ceiling division; long keeps the bit count from overflowing.
        return (uint)(((long)Value.Length * bitgroup + 7) / 8);
    }

    /// <summary>
    /// Returns the expected length of the resulting string when decompacting this binary data.
    /// This is NOT with subtracting 0 values.
    /// (PS, the length is the length in characters. Since in .NET all strings are Unicode,
    /// the memory usage will be greater than that.)
    /// </summary>
    /// <param name="b">The compacted data.</param>
    /// <returns>The number of whole character codes that fit in <paramref name="b"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="b"/> is null.</exception>
    public int GetLength(Byte[] b)
    {
        if (b == null)
        {
            throw new ArgumentNullException("b");
        }
        // long keeps the intermediate bit count from overflowing.
        return (int)(8L * b.Length / bitgroup);
    }

    /// <summary>
    /// Determines if an exception should be thrown if Compact encounters
    /// characters that are not in the allowed string.
    /// If this value is false, those chars are simply skipped.
    /// </summary>
    public bool ThrowOnInvalidChars = true;

    /// <summary>
    /// Turns compacted data back into text. Reading stops at the first
    /// character code of 0 or when no complete code is left.
    /// </summary>
    /// <param name="bytes">Data produced by Compact with the same allowed characters.</param>
    /// <returns>
    /// The decoded text, or null when <paramref name="bytes"/> is null or empty.
    /// </returns>
    /// <exception cref="IndexOutOfRangeException">
    /// The data contains a code larger than the number of allowed characters
    /// (it was not produced with the same allowed characters).
    /// </exception>
    public string DeCompact(byte[] bytes)
    {
        if (bytes == null || bytes.Length == 0)
        {
            return null;
        }

        char[] chars = new char[GetLength(bytes)];
        BitArray source = new BitArray(bytes); // bit 0 is the lowest bit of bytes[0]
        int charpos = 0;
        int bit = 0;

        while (charpos < chars.Length)
        {
            // Rebuild one code from BitGroup consecutive bits, lowest bit first.
            int index = 0;
            for (int j = 0; j < bitgroup; j++)
            {
                if (source[bit++])
                {
                    index |= 1 << j;
                }
            }

            if (index == 0)
            {
                break; // end-of-text marker / zero padding
            }
            chars[charpos++] = allow[index - 1];
        }

        return new string(chars, 0, charpos);
    }

    /// <summary>
    /// Default allowed values: the digits 4, 3, 2, 1 and the upper-case letters
    /// A-Z (30 characters, which results in 5 bit storage).
    /// </summary>
    public const string CompactStringBase =
        "4321ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /// <summary>
    /// Variant of <see cref="CompactStringBase"/> with '|' and '%' instead of
    /// '4' and '3' (30 characters, 5 bit storage).
    /// </summary>
    public const string CompactStringForm =
        "|%21ABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /// <summary>
    /// Lower-case and upper-case letters plus digits (62 characters, 6 bit storage).
    /// </summary>
    public const string CompactStringVariableName =
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

    /// <summary>
    /// String with allowed characters for a .NET type name (results in 6 bit storage).
    /// </summary>
    public const string CompactStringTypeName = CompactStringVariableName +
        ".";

    /// <summary>
    /// A string with the characters 1 through 255 (results in 8 bit storage,
    /// so a normal byte and therefore not very useful).
    /// </summary>
    public static string CompactStringNormalAscii
    {
        get { return normalAscii; }
    }

    // Smallest number of bits able to hold the values 0..characterCount
    // (integer replacement for Ceiling(Log2(characterCount + 1))).
    static int BitsFor(int characterCount)
    {
        int bits = 0;
        while ((1L << bits) <= characterCount)
        {
            bits++;
        }
        return bits;
    }

    // Builds the 255-character string (char)1 .. (char)255.
    static string BuildNormalAscii()
    {
        char[] all = new char[255];
        for (int i = 0; i < all.Length; i++)
        {
            all[i] = (char)(i + 1);
        }
        return new string(all);
    }
}
/// <summary>
/// A compactor that uses <see cref="Compactor.CompactStringTypeName"/>
/// (letters, digits and '.'), intended for storing .NET type names.
/// Characters outside that set (for example the '+' of nested types or the
/// '`' of generic types) are not allowed and make Compact throw.
/// </summary>
public class CompactorTypeName : Compactor
{
    /// <summary>
    /// Creates a compactor for type names.
    /// </summary>
    public CompactorTypeName()
        : base(CompactStringTypeName)
    {
    }

    /// <summary>
    /// Compacts the <see cref="Type.FullName"/> of a type.
    /// </summary>
    /// <param name="t">The type whose full name is stored.</param>
    /// <returns>The compacted full name.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="t"/> is null.</exception>
    public byte[] Compact(Type t)
    {
        if (t == null)
        {
            throw new ArgumentNullException("t");
        }
        return Compact(t.FullName);
    }

    /// <summary>
    /// Decompacts a type name and resolves it to a <see cref="Type"/>.
    /// </summary>
    /// <param name="b">Data produced by compacting a type's full name.</param>
    /// <returns>
    /// The resolved type, or null when no loaded assembly contains a type with
    /// that name. When several assemblies define the name, the first match wins.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="b"/> is null or empty, so there is no type name to resolve.
    /// </exception>
    public Type GetType(byte[] b)
    {
        string typeName = DeCompact(b);

        // Fast path; also keeps the original behaviour for a null name.
        Type found = Type.GetType(typeName);
        if (found != null || typeName.Length == 0)
        {
            return found;
        }

        // Type.GetType only looks in the calling assembly and the core library
        // when the name is not assembly qualified, so a type that lives in the
        // application (such as a form) would not be found from here.
        foreach (System.Reflection.Assembly assembly in AppDomain.CurrentDomain.GetAssemblies())
        {
            found = assembly.GetType(typeName);
            if (found != null)
            {
                return found;
            }
        }

        return null;
    }
}
/// <summary>
/// A compactor using the <see cref="Compactor.CompactStringNormalAscii"/>
/// character set (the characters 1 through 255, 8 bits per character).
/// Not very useful, you might as well use a single-byte text encoding:
/// the length in bytes will simply be the same as the string length.
/// It is included so that a compactor's range can be increased quickly
/// without the need for rewriting too much ;-)
/// </summary>
public class CompactorAscii : Compactor
{
    /// <summary>
    /// Creates a compactor that accepts the characters 1 through 255.
    /// </summary>
    public CompactorAscii()
        : base(CompactStringNormalAscii)
    {
    }
}
}
. . .
Example usage:
// 1) Round-trip a sentence through the 8-bit compactor.
Subro.Text.Compactor c = new Subro.Text.CompactorAscii();
string t = "This is a test, does it compute?";
byte[] b;
// The second argument asks for a fixed 50-byte array; bytes that are not
// needed stay zero and are ignored again by DeCompact.
b = c.Compact(t,50);
t = c.DeCompact(b);
// 2) Store a type by its full name (6 bits per character) and resolve it again.
// Form1 is presumably a form class of the calling application; it is not defined here.
Type ty = typeof(Form1);
c = new Subro.Text.CompactorTypeName();
b = c.Compact(ty.FullName);
ty = (c as Subro.Text.CompactorTypeName).GetType(b);
// The examples above do not really save much space.
// The class is more intended for values from a small, known set,
// e.g. results made up of A, B, C and D, without having to worry about
// mapping them to numbers yourself (a bad example, because
// generally that would mean a bad design, but there had to be some
// sort of example :-/ )
// 3) With five allowed characters every character takes only 3 bits.
c = new Subro.Text.Compactor("ABCDE");
b = c.Compact("AAAEABBCCCCDDABCAADAAEBCAADDAACCDDAADDCCAAADDBBBDDACDEAAEEE");
t = c.DeCompact(b);
. . .