diff --git a/c/cmf.c b/c/cmf.c
new file mode 100644
index 0000000..5ae0460
--- /dev/null
+++ b/c/cmf.c
@@ -0,0 +1,216 @@
+#include "cmf.h"
+
+#include <assert.h>
+#include <limits.h> // for ULONG_MAX, UINT_MAX, LONG_MAX, LONG_MIN
+#include <string.h> // for memcpy
+
+/*
+ * Decodes one variable-width unsigned number starting at *data.
+ *
+ * Every byte carries 7 payload bits, most significant group first; a set high
+ * bit (0x80) means another byte follows. Each continuation also adds 1 to the
+ * running value, mirroring the "- 1" applied per byte in cmf_serialize().
+ *
+ * On success *result holds the decoded value, *data points just past the last
+ * byte consumed, and true is returned. If the input ends before the final byte
+ * or the value does not fit in an unsigned long, false is returned and neither
+ * *data nor *result is modified.
+ */
+static bool cmf_unserialize(const unsigned char **data, const unsigned char * const endMessage, unsigned long *result)
+{
+    assert(data);
+    assert(*data);
+    assert(result);
+    assert(endMessage);
+
+    unsigned long decoded = 0;
+    const unsigned char *ptr = *data;
+    while (ptr < endMessage) {
+        const unsigned char byte = *ptr++;
+        if (decoded > (ULONG_MAX >> 7))
+            return false; // the shift below would drop high bits
+        decoded = (decoded << 7) | (byte & 0x7F);
+        if (!(byte & 0x80)) {
+            *result = decoded;
+            *data = ptr;
+            return true;
+        }
+        if (decoded == ULONG_MAX)
+            return false; // the continuation's +1 would wrap around
+        decoded += 1;
+    }
+    return false;
+}
+
+/*
+ * Encodes value in the variable-width format read by cmf_unserialize() and
+ * advances *data past the bytes written (a single byte for values <= 0x7F).
+ *
+ * The 7-bit groups are produced least significant first and then reversed in
+ * place. The caller must guarantee that the buffer has enough room.
+ */
+static void cmf_serialize(unsigned char **data, unsigned long value)
+{
+    assert(data);
+    assert(*data);
+
+    unsigned char *start = *data;
+    unsigned char *pos = start;
+    while (true) {
+        // only the first byte written (the last one after reversal) lacks the continuation bit
+        *pos = (unsigned char) ((value & 0x7F) | (pos != start ? 0x80 : 0x00));
+        if (value <= 0x7F)
+            break;
+        value = (value >> 7) - 1;
+        ++pos;
+    }
+    *data = pos + 1;
+
+    // reverse
+    while (pos > start) {
+        unsigned char tmp = *start; // swap
+        *start = *pos;
+        *pos = tmp;
+        ++start;
+        --pos;
+    }
+}
+
+/*
+ * Writes the leading byte of a token: the tag in the upper 5 bits and the
+ * format in the lower 3 bits. A tag of 31 or more does not fit; in that case
+ * the tag bits are all set to 1 and the tag follows as a variable-width number.
+ */
+static void cmf_write(unsigned char **data, unsigned int tag, short type)
+{
+    assert(data);
+    assert(*data);
+    assert(type >= 0 && type < 8);
+
+    unsigned char *out = *data;
+    if (tag >= 31) { // use more than 1 byte
+        *out++ = (unsigned char) (type | 0xF8); // set the 'tag' to all 1s
+        cmf_serialize(&out, tag);
+    } else {
+        *out++ = (unsigned char) ((tag << 3) | (unsigned int) type);
+    }
+    *data = out;
+}
+
+/* Appends a signed integer: the sign selects the format, the magnitude is the payload. */
+void cmfbuilder_add_int(unsigned char **ptr, unsigned int tag, int value)
+{
+    short type = CMFMT_POSITIVE_NUMBER;
+    unsigned long magnitude = (unsigned long) value;
+    if (value < 0) {
+        type = CMFMT_NEGATIVE_NUMBER;
+        // negate in unsigned arithmetic; "value *= -1" overflows for INT_MIN
+        magnitude = 0UL - magnitude;
+    }
+    cmf_write(ptr, tag, type);
+    cmf_serialize(ptr, magnitude);
+}
+
+/* Appends an unsigned number as a CMFMT_POSITIVE_NUMBER token. */
+void cmfbuilder_add_ulong(unsigned char **ptr, unsigned int tag, unsigned long value)
+{
+    cmf_write(ptr, tag, CMFMT_POSITIVE_NUMBER);
+    cmf_serialize(ptr, value);
+}
+
+/*
+ * Appends a length-prefixed copy of data; fmt must be CMFMT_STRING_UTF8 or
+ * CMFMT_BYTES. A length of zero (or, in builds without asserts, a negative
+ * one) writes an empty payload and never reads from data.
+ */
+void cmfbuilder_add_bytes(unsigned char **ptr, unsigned int tag, const char *data, int length, enum cmf_message_format fmt)
+{
+    assert(fmt == CMFMT_STRING_UTF8 || fmt == CMFMT_BYTES);
+    assert(length >= 0);
+    assert(data || length == 0);
+
+    cmf_write(ptr, tag, (short) fmt);
+    if (length <= 0) {
+        cmf_serialize(ptr, 0);
+        return;
+    }
+    cmf_serialize(ptr, (unsigned long) length);
+    memcpy(*ptr, data, (size_t) length);
+    *ptr += length;
+}
+
+/* Appends a boolean; the value is carried by the format bits, so there is no payload. */
+void cmfbuilder_add_bool(unsigned char **ptr, unsigned int tag, bool value)
+{
+    cmf_write(ptr, tag, value ? CMFMT_BOOL_TRUE : CMFMT_BOOL_FALSE);
+}
+
+
+/*
+ * Parses the token starting at *ptr into *token.
+ *
+ * Returns CMF_DOCUMENT_END when *ptr has reached endMessage, CMF_FOUND_TOKEN
+ * after a complete token (with *ptr advanced past it) and CMF_PARSER_ERROR for
+ * truncated or invalid input. On CMF_PARSER_ERROR *ptr is left at the start of
+ * the offending token; token->fmt and token->tag may already have been written.
+ */
+enum cmf_parser_result cmfparser_next(const unsigned char **ptr, const unsigned char * const endMessage, struct cmf_message_parser_token *token)
+{
+    assert(ptr);
+    assert(*ptr);
+    assert(endMessage);
+    assert(token);
+
+    if (*ptr >= endMessage)
+        return CMF_DOCUMENT_END;
+
+    // Work on a private cursor so *ptr only moves once the whole token is valid.
+    const unsigned char *cursor = *ptr;
+    const unsigned char byte = *cursor++;
+    token->fmt = (enum cmf_message_format)(byte & 0x07);
+    token->tag = byte >> 3;
+    if (token->tag == 31) { // the tag is stored in the next byte(s)
+        unsigned long tag = 0;
+        if (!cmf_unserialize(&cursor, endMessage, &tag) || tag > UINT_MAX)
+            return CMF_PARSER_ERROR;
+        token->tag = (unsigned int) tag;
+    }
+
+    unsigned long value = 0;
+
+    switch (token->fmt) {
+    case CMFMT_POSITIVE_NUMBER:
+    case CMFMT_NEGATIVE_NUMBER:
+        if (!cmf_unserialize(&cursor, endMessage, &value))
+            return CMF_PARSER_ERROR;
+        if (token->fmt == CMFMT_POSITIVE_NUMBER)
+            token->big_num = value;
+        else if (value <= (unsigned long) LONG_MAX)
+            token->signed_num = -(long) value;
+        else if (value == (unsigned long) LONG_MAX + 1UL)
+            token->signed_num = LONG_MIN; // this magnitude cannot be negated as a long
+        else
+            return CMF_PARSER_ERROR; // magnitude does not fit in a long
+        break;
+    case CMFMT_BYTES:
+    case CMFMT_STRING_UTF8:
+        if (!cmf_unserialize(&cursor, endMessage, &value))
+            return CMF_PARSER_ERROR;
+        // Compare with the bytes that are left rather than forming begin + value:
+        // pointer arithmetic past the buffer is undefined for an untrusted length.
+        if (value > (size_t) (endMessage - cursor)) // The actual value is not included in the message
+            return CMF_PARSER_ERROR;
+        token->begin = cursor;
+        token->end = cursor + value;
+        cursor = token->end;
+        break;
+    case CMFMT_BOOL_TRUE:
+    case CMFMT_BOOL_FALSE:
+        break; // nothing follows the leading byte(s)
+    default:
+        return CMF_PARSER_ERROR;
+    }
+
+    *ptr = cursor;
+    return CMF_FOUND_TOKEN;
+}
diff --git a/c/cmf.h b/c/cmf.h
new file mode 100644
index 0000000..df668d8
--- /dev/null
+++ b/c/cmf.h
@@ -0,0 +1,70 @@
+#ifndef _CMF_H
+#define _CMF_H
+
+#include <stdbool.h>
+
+enum cmf_message_format {
+    CMFMT_POSITIVE_NUMBER = 0,
+    CMFMT_NEGATIVE_NUMBER = 1,
+    CMFMT_STRING_UTF8 = 2,
+    CMFMT_BYTES = 3,
+    CMFMT_BOOL_TRUE = 4,
+    CMFMT_BOOL_FALSE = 5
+    // TODO double
+};
+
+enum cmf_parser_result {
+    CMF_FOUND_TOKEN,
+    CMF_PARSER_ERROR,
+    CMF_DOCUMENT_END
+};
+
+/* builder API */
+
+/*
+ * The cmfbuilder_add_* range of functions append data to an existing buffer.
+ * The amount of data added depends on the actual value passed because a variable-width byte encoding is being used.
+ * The 'tag' takes at most 5 bytes. It takes zero bytes if the value is < 31.
+ * The format (int/string/etc) is encoded and takes 1 byte.
+ * The actual byte count used for the value is format dependent.
+ *    numbers take up to 10 bytes for a 64-bit value. (notice that negative numbers are multiplied by -1 before being encoded, so -1 is just 1 byte)
+ *    byte-arrays are just copied. Additional byte-count is the length. Also var-encoded.
+ *    booleans are free. No bytes taken.
+ *
+ * Please make sure enough bytes are available in the buffer as no effort is being made to avoid writing past the end of the buffer.
+ */
+
+void cmfbuilder_add_int(unsigned char **ptr, unsigned int tag, int value);
+void cmfbuilder_add_ulong(unsigned char **ptr, unsigned int tag, unsigned long value);
+/*
+ * Add-bytes allows the caller to specify the format explicitly because the compact message format
+ * supports both byte-arrays as well as utf-8 encoded strings.
+ * Pass the one you want to encode. Notice that if you pass anything other than CMFMT_STRING_UTF8 or CMFMT_BYTES your
+ * stream will be corrupted.
+ */
+void cmfbuilder_add_bytes(unsigned char **ptr, unsigned int tag, const char *data, int length, enum cmf_message_format fmt);
+void cmfbuilder_add_bool(unsigned char **ptr, unsigned int tag, bool value);
+
+/* parser API */
+
+
+/*
+ * The cmf parser is essentially a SAX-style parser that allows really fast parsing and zero data-copy.
+ *
+ * The cmfparser_next() function can be called repeatedly until the token you wish to find has been located.
+ * The actual token values are stored in the cmf_message_parser_token struct which can be reused for all calls.
+ */
+struct cmf_message_parser_token {
+    int tag;
+    enum cmf_message_format fmt;
+    union {
+        unsigned long big_num; /* Used when fmt is CMFMT_POSITIVE_NUMBER */
+        long signed_num; /* Used when fmt is CMFMT_NEGATIVE_NUMBER */
+    };
+    const unsigned char *begin, *end; /* used for byte arrays and strings */
+};
+
+enum cmf_parser_result cmfparser_next(const unsigned char **ptr, const unsigned char * const endMessage, struct cmf_message_parser_token *token);
+
+
+#endif
diff --git a/c/test.c b/c/test.c
new file mode 100644
index 0000000..e401a5b
--- /dev/null
+++ b/c/test.c
@@ -0,0 +1,194 @@
+#include "cmf.h"
+
+#ifdef NDEBUG
+# error run the test in debug mode otherwise you wont get any results
+#endif
+#include <assert.h>
+
+#include <string.h>
+#include <stdio.h>
+
+/* A tag below 31 shares the leading byte with the format bits. */
+static void basic_test1(void)
+{
+    unsigned char buf[100];
+    unsigned char *ptr = buf;
+    struct cmf_message_parser_token token;
+    enum cmf_parser_result found;
+
+    cmfbuilder_add_int(&ptr, 15, 6512);
+    assert(ptr > buf);
+    assert(ptr - buf == 3);
+    assert(buf[0] == 120);
+    assert((unsigned char) buf[1] == 177);
+    assert(buf[2] == 112);
+
+    const unsigned char *parsePtr = buf;
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_FOUND_TOKEN);
+    assert(token.tag == 15);
+    assert(token.fmt == CMFMT_POSITIVE_NUMBER);
+    assert(token.big_num == 6512);
+
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_DOCUMENT_END);
+}
+
+/* A tag of 31 or more is written as extra variable-width bytes after the leading byte. */
+static void basic_test2(void)
+{
+    unsigned char buf[100];
+    unsigned char *ptr = buf;
+    struct cmf_message_parser_token token;
+    enum cmf_parser_result found;
+
+    cmfbuilder_add_int(&ptr, 129, 6512);
+    assert(ptr > buf);
+    assert(ptr - buf == 5);
+    assert((unsigned char) buf[0] == 248);
+    assert((unsigned char) buf[1] == 128);
+    assert(buf[2] == 1);
+    assert((unsigned char) buf[3] == 177);
+    assert(buf[4] == 112);
+
+    const unsigned char *parsePtr = buf;
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_FOUND_TOKEN);
+    assert(token.tag == 129);
+    assert(token.fmt == CMFMT_POSITIVE_NUMBER);
+    assert(token.big_num == 6512);
+
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_DOCUMENT_END);
+}
+
+/* Strings, byte arrays and both booleans, with short and long tags. */
+static void test_types(void)
+{
+    unsigned char buf[100];
+    unsigned char *ptr = buf;
+    struct cmf_message_parser_token token;
+    enum cmf_parser_result found;
+
+    const char *foo = "Föo";
+    assert(strlen(foo) == 4); // someone changed encoding of this source file
+    cmfbuilder_add_bytes(&ptr, 1, foo, 4, CMFMT_STRING_UTF8);
+    const char *hihi = "hihi";
+    cmfbuilder_add_bytes(&ptr, 200, hihi, 4, CMFMT_BYTES);
+    cmfbuilder_add_bool(&ptr, 3, true);
+    cmfbuilder_add_bool(&ptr, 40, false);
+    assert(ptr > buf);
+    assert(ptr - buf == 17);
+    // string '1'
+    assert((unsigned char) buf[0] == 10);
+    assert((unsigned char) buf[1] == 4); // serialized string length
+    assert((unsigned char) buf[2] == 70);
+    assert((unsigned char) buf[3] == 195);
+    assert((unsigned char) buf[4] == 182);
+    assert((unsigned char) buf[5] == 111);
+
+    // blob '200'
+    assert((unsigned char) buf[6] == 251);
+    assert((unsigned char) buf[7] == 128);
+    assert((unsigned char) buf[8] == 72);
+    assert((unsigned char) buf[9] == 4); // length of bytearray
+    assert((unsigned char) buf[10] == 104); // 'h'
+    assert((unsigned char) buf[11] == 105); // 'i'
+    assert((unsigned char) buf[12] == 104); // 'h'
+    assert((unsigned char) buf[13] == 105); // 'i'
+
+    // bool-true '3'
+    assert((unsigned char) buf[14] == 28);
+
+    // bool-false '40'
+    assert((unsigned char) buf[15] == 253);
+    assert((unsigned char) buf[16] == 40);
+
+    const unsigned char *parsePtr = buf;
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_FOUND_TOKEN);
+    assert(token.tag == 1);
+    assert(token.fmt == CMFMT_STRING_UTF8);
+    assert(token.begin > buf);
+    assert(token.begin < ptr);
+    assert(token.end > buf);
+    assert(token.end < ptr);
+    assert(token.end - token.begin == 4);
+    assert(memcmp(foo, token.begin, 4) == 0); // be careful, no trailing zero!
+
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_FOUND_TOKEN);
+    assert(token.tag == 200);
+    assert(token.fmt == CMFMT_BYTES);
+    assert(token.begin > buf);
+    assert(token.begin < ptr);
+    assert(token.end > buf);
+    assert(token.end < ptr);
+    assert(token.end - token.begin == 4);
+    assert(memcmp(hihi, token.begin, 4) == 0); // be careful, no trailing zero!
+
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_FOUND_TOKEN);
+    assert(token.tag == 3);
+    assert(token.fmt == CMFMT_BOOL_TRUE);
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_FOUND_TOKEN);
+    assert(token.tag == 40);
+    assert(token.fmt == CMFMT_BOOL_FALSE);
+
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_DOCUMENT_END);
+}
+
+/* A negative int is stored as its magnitude under CMFMT_NEGATIVE_NUMBER. */
+static void test_negative_int(void)
+{
+    unsigned char buf[100];
+    unsigned char *ptr = buf;
+    struct cmf_message_parser_token token;
+    enum cmf_parser_result found;
+
+    cmfbuilder_add_int(&ptr, 7, -300);
+    assert(ptr - buf == 3);
+    assert(buf[0] == 57);
+    assert(buf[1] == 129);
+    assert(buf[2] == 44);
+
+    const unsigned char *parsePtr = buf;
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_FOUND_TOKEN);
+    assert(token.tag == 7);
+    assert(token.fmt == CMFMT_NEGATIVE_NUMBER);
+    assert(token.signed_num == -300);
+
+    found = cmfparser_next(&parsePtr, ptr, &token);
+    assert(found == CMF_DOCUMENT_END);
+}
+
+/* A payload that runs past the end of the message must be rejected. */
+static void test_truncated_bytes(void)
+{
+    unsigned char buf[100];
+    unsigned char *ptr = buf;
+    struct cmf_message_parser_token token;
+    enum cmf_parser_result found;
+
+    cmfbuilder_add_bytes(&ptr, 1, "abcd", 4, CMFMT_BYTES);
+    assert(ptr - buf == 6);
+
+    const unsigned char *parsePtr = buf;
+    found = cmfparser_next(&parsePtr, ptr - 1, &token); // drop the last payload byte
+    assert(found == CMF_PARSER_ERROR);
+}
+
+int main(void)
+{
+    basic_test1();
+    basic_test2();
+    test_types();
+    test_negative_int();
+    test_truncated_bytes();
+
+    return 0;
+}
+