Sign in to follow this  
Lode

Plain old C code review

Recommended Posts

Hi, I've been converting a part of a C++ library I had made in the past, to C (C90 more specifically). However I have experience with C++, but not with C, so a lot of things came as a surprise to me (such as putting all declarations at the top of a scope, putting the struct keyword in front of everything, and no "//" comments). Because I'm inexperienced with C, but I'm thinking about releasing the C-version anyway, to avoid looking too newbie, could someone with experience in C take a look at this code? In other words, a code review :) Because the C++ code had a lot of std::vectors I made some sort of replacement in C for those std::vectors. Is this foolish or not, or what do real C coders do to get dynamic arrays with a convenient syntax? That vector stuff is only a small part of the C code, but to save you from having to look through a lot of C code, I'll only post the vector code here and one other function that uses it to emulate an "std::vector<std::vector<unsigned> >". Anyway here it is: The vectors, vector is generic, uvector is for unsigned ints and ucvector is for unsigned chars:
struct vector
{
  void* data;
  size_t size; /*in groups of bytes depending on type*/
  size_t allocsize; /*in bytes*/
  unsigned typesize; /*sizeof the type you store in data*/
};

void vector_clear(struct vector* p) /*unlike C++'s std::vector, here it just clears the memory*/
{
  p->size = p->allocsize = 0;
  free(p->data);
  p->data = NULL;
}

/*clear and use destructor on elements*/
void vector_cleard(struct vector* p, void dtor(void*))
{
  size_t i;
  for(i = 0; i < p->size; i++)
  dtor(&((char*)(p->data))[i * p->typesize]);
  vector_clear(p);
}

void vector_dtor(struct vector* p)
{
  vector_clear(p);
}

void vector_dtord(struct vector* p, void dtor(void*))
{
  vector_cleard(p, dtor);
}

void vector_resize(struct vector* p, unsigned size)
{
  if(size * p->typesize > p->allocsize)
  {
    p->allocsize = size * p->typesize * 2;
    p->data = realloc(p->data, p->allocsize);
  }
  p->size = size;
}

/*resize and use destructor on elements*/
void vector_resized(struct vector* p, size_t size, void dtor(void*))
{
  size_t i;
  if(size < p->size)
  for(i = size; i < p->size; i++)
  dtor(&((char*)(p->data))[i * p->typesize]);
  
  vector_resize(p, size);
}

void vector_ctor(struct vector* p, unsigned typesize)
{
  p->data = NULL;
  p->size = p->allocsize = 0;
  p->typesize = typesize;
}

void vector_ctorl(struct vector* p, size_t length, unsigned typesize)
{
  vector_ctor(p, typesize);
  vector_resize(p, length);
}

void vector_push_back(struct vector* p, const void* c)
{
  size_t i;
  vector_resize(p, p->size + 1);
  for(i = 0; i < p->typesize; i++) ((char*)p->data)[p->size * p->typesize - p->typesize + i] = ((char*)c)[i];
}

void vector_pop_back(struct vector* p)
{
  vector_resize(p, p->size - 1);
}

void vector_copy(struct vector* p, const struct vector* q) /*copy q to p*/
{
  size_t i;
  vector_resize(p, q->size);
  for(i = 0; i < q->size * p->typesize; i++) ((char*)p->data)[i] = ((char*)q->data)[i];
}

void vector_swap(struct vector* p, struct vector* q) /*they're supposed to have the same typesize*/
{
  size_t tmp;
  void* tmpp;
  tmp = p->size; p->size = q->size; q->size = tmp;
  tmp = p->allocsize; p->allocsize = q->allocsize; q->allocsize = tmp;
  tmpp = p->data; p->data = q->data; q->data = tmpp;
}

void vector_set(struct vector* p, size_t index, void* value)
{
  size_t i;
  for(i = 0; i < p->typesize; i++) ((char*)p->data)[index * p->typesize + i] = ((char*)value)[i];
}

const void* vector_get(const struct vector* p, size_t index)
{
  return &((const char*)p->data)[index * p->typesize];
}

void* vector_back(struct vector* p)
{
  return (void*)(&((char*)p->data)[p->size * p->typesize - p->typesize]);
}

/*/////////////////////////////////////////////////////////////////////////////*/

struct uvector
{
  unsigned* data;
  size_t size; /*size in number of unsigned longs*/
  size_t allocsize; /*size in bytes*/
};

void uvector_clear(struct uvector* p) /*unlike C++'s std::vector, here it just clears the memory*/
{
  p->size = p->allocsize = 0;
  free(p->data);
  p->data = NULL;
}

void uvector_dtor(struct uvector* p)
{
  uvector_clear(p);
}

void uvector_resize(struct uvector* p, size_t size)
{
  if(size * sizeof(unsigned) > p->allocsize)
  {
    p->allocsize = size * sizeof(unsigned) * 2;
    p->data = (unsigned*)realloc(p->data, p->allocsize);
  }
  p->size = size;
}

void uvector_resizev(struct uvector* p, size_t size, unsigned value)
{
  size_t oldsize = p->size, i;
  uvector_resize(p, size);
  for(i = oldsize; i < size; i++) p->data[i] = value;
}

void uvector_ctor(struct uvector* p)
{
  p->data = NULL;
  p->size = p->allocsize = 0;
}

void uvector_ctorl(struct uvector* p, size_t length)
{
  uvector_ctor(p);
  uvector_resize(p, length);
}

void uvector_ctorlv(struct uvector* p, size_t length, unsigned value)
{
  uvector_ctor(p);
  uvector_resizev(p, length, value);
}

void uvector_push_back(struct uvector* p, unsigned c)
{
  uvector_resize(p, p->size + 1);
  p->data[p->size - 1] = c;
}

void uvector_pop_back(struct uvector* p)
{
  uvector_resize(p, p->size - 1);
}

void uvector_copy(struct uvector* p, const struct uvector* q) /*copy q to p*/
{
  size_t i;
  uvector_resize(p, q->size);
  for(i = 0; i < q->size; i++) p->data[i] = q->data[i];
}

void uvector_swap(struct uvector* p, struct uvector* q)
{
  size_t tmp;
  unsigned* tmpp;
  tmp = p->size; p->size = q->size; q->size = tmp;
  tmp = p->allocsize; p->allocsize = q->allocsize; q->allocsize = tmp;
  tmpp = p->data; p->data = q->data; q->data = tmpp;
}

unsigned* uvector_back(struct uvector* p)
{
  return &p->data[p->size - 1];
}

/*/////////////////////////////////////////////////////////////////////////////*/

struct ucvector
{
  unsigned char* data;
  size_t size;
  size_t allocsize;
};

void ucvector_clear(struct ucvector* p) /*unlike C++'s std::vector, here it just clears the memory*/
{
  p->size = p->allocsize = 0;
  free(p->data);
  p->data = NULL;
}

void ucvector_dtor(struct ucvector* p)
{
  ucvector_clear(p);
}

void ucvector_resize(struct ucvector* p, size_t size)
{
  if(size > p->allocsize)
  {
    p->allocsize = size * 2;
    p->data = (unsigned char*)realloc(p->data, p->allocsize);
  }
  p->size = size;
}

void ucvector_resizev(struct ucvector* p, size_t size, unsigned char value)
{
  size_t oldsize = p->size, i;
  ucvector_resize(p, size);
  for(i = oldsize; i < size; i++) p->data[i] = value;
}

void ucvector_ctor(struct ucvector* p)
{
  p->data = NULL;
  p->size = p->allocsize = 0;
}

void ucvector_ctorl(struct ucvector* p, size_t length)
{
  ucvector_ctor(p);
  ucvector_resize(p, length);
}

void ucvector_ctorlv(struct ucvector* p, size_t length, unsigned char value)
{
  ucvector_ctor(p);
  ucvector_resizev(p, length, value);
}

void ucvector_push_back(struct ucvector* p, unsigned char c)
{
  ucvector_resize(p, p->size + 1);
  p->data[p->size - 1] = c;
}

void ucvector_pop_back(struct ucvector* p)
{
  ucvector_resize(p, p->size - 1);
}

void ucvector_copy(struct ucvector* p, const struct ucvector* q) /*copy q to p*/
{
  size_t i;
  ucvector_resize(p, q->size);
  for(i = 0; i < q->size; i++) p->data[i] = q->data[i];
}

void ucvector_swap(struct ucvector* p, struct ucvector* q)
{
  size_t tmp;
  unsigned char* tmpp;
  tmp = p->size; p->size = q->size; q->size = tmp;
  tmp = p->allocsize; p->allocsize = q->allocsize; q->allocsize = tmp;
  tmpp = p->data; p->data = q->data; q->data = tmpp;
}

unsigned char* ucvector_back(struct ucvector* p)
{
  return &p->data[p->size - 1];
}
A function that uses the vectors:
/*LZ77-encode the data using a hash table technique to let it encode faster.*/
void encodeLZ77(struct uvector* out, const unsigned char* in, size_t size, unsigned windowSize)
{
  /**generate hash table**/
  /*table represents what would be an std::vector<std::vector<unsigned> > in C++*/
  struct vector table; /*HASH_NUM_VALUES uvectors*/
  struct uvector tablepos1;
  struct uvector tablepos2;
  
  unsigned pos, i;
  
  vector_ctorl(&table, HASH_NUM_VALUES, sizeof(struct uvector));
  for(i = 0; i < HASH_NUM_VALUES; i++)
  {
    struct uvector* v = (struct uvector*)vector_get(&table, i);
    uvector_ctor(v);
  }

  /*remember start and end positions in the tables to searching in*/
  uvector_ctorlv(&tablepos1, HASH_NUM_VALUES, 0);
  uvector_ctorlv(&tablepos2, HASH_NUM_VALUES, 0);
  
  for(pos = 0; pos < size; pos++)
  {
    unsigned length = 0, offset = 0; /*the length and offset found for the current position*/
    unsigned max_offset = pos < windowSize ? pos : windowSize; /*how far back to test*/
    
    unsigned tablepos;
  
    /*/search for the longest string*/
    /*first find out where in the table to start (the first value that is in the range from "pos - max_offset" to "pos")*/
    unsigned hash = getHash(in, size, pos);
    uvector_push_back((struct uvector*)vector_get(&table, hash), pos);
    
    while(((struct uvector*)vector_get(&table, hash))->data[tablepos1.data[hash]] < pos - max_offset) tablepos1.data[hash]++; /*it now points to the first value in the table for which the index is larger than or equal to pos - max_offset*/
    while(((struct uvector*)vector_get(&table, hash))->data[tablepos2.data[hash]] < pos) tablepos2.data[hash]++; /*it now points to the first value in the table for which the index is larger than or equal to pos*/

    for(tablepos = tablepos2.data[hash] - 1; tablepos >= tablepos1.data[hash] && tablepos < tablepos2.data[hash]; tablepos--)
    {
      unsigned backpos = ((struct uvector*)vector_get(&table, hash))->data[tablepos];
      unsigned current_offset = pos - backpos;

      /*test the next characters*/
      unsigned current_length = 0;
      unsigned backtest = backpos;
      unsigned foretest = pos;
      while(foretest < size && in[backtest] == in[foretest] && current_length < MAX_SUPPORTED_DEFLATE_LENGTH) /*maximum supporte length by deflate is max length*/
      {
        if(backpos >= pos) backpos -= current_offset; /*continue as if we work on the decoded bytes after pos by jumping back before pos*/
        current_length++;
        backtest++;
        foretest++;
      }
      if(current_length > length)
      {
        length = current_length; /*the longest length*/
        offset = current_offset; /*the offset that is related to this longest length*/
        if(current_length == MAX_SUPPORTED_DEFLATE_LENGTH) break; /*you can jump out of this for loop once a length of max length is found (gives significant speed gain)*/
      }
    }
    
    /**encode it as length/distance pair or literal value**/
    if(length < 3) /*only lengths of 3 or higher are supported as length/distance pair*/
    {
      uvector_push_back(out, in[pos]);
    }
    else
    {
      unsigned j;
      addLengthDistance(out, length, offset);
      /*pos += (length - 1);*/
      for(j = 0; j < length - 1; j++)
      {
        pos++;
        uvector_push_back((struct uvector*)vector_get(&table, getHash(in, size, pos)), pos);
      }
    }
  } /*end of the loop through each character of input*/
  
  /*cleanup*/
  for(i = 0; i < table.size; i++)
  {
    struct uvector* v = (struct uvector*)vector_get(&table, i);
    uvector_dtor(v);
  }
  vector_dtor(&table);
  uvector_dtor(&tablepos1);
  uvector_dtor(&tablepos2);
}
Thanks! Hopefully such a post is allowed on GDNet :) I use it in games at least.

Share this post


Link to post
Share on other sites
Quote:
Original post by longjumper
I guess I don't understand why you wouldn't use a more modern version of C and get rid of those pesky things you mentioned at the top of your post.


This is C90 code. According to both Wikipedia and the #C channel on freenode, a more modern C (= C99) is not widely supported. The gcc compiler itself has a list of important C99 things it doesn't support yet. I don't know why, but the C90 standard appears to be much more portable than the C99 one. And I wouldn't go through this conversion to C if it wasn't for portability. When using C99 I can as well stay with C++.

Share this post


Link to post
Share on other sites
I am still wondering why didn't you just wrote a bunch of wrapper functions to the C++ code, create a header, extern "C" the whole file if __cplusplus is defined,
then write copies of the OO functions where instead of:


returntype object.method(args,...)


you have


returntype function(void* object,args,...)


then implement those functions on a cpp file as calls to the objects, for example:


#include "cobject.h"
void* newobject()
{
return new cobject;
}

void deleteobject(void* object)
{
delete (cobject*) object;
}

returntype function(void* object,args,...)
{
return ((cobject*)object)->function(args,...);
}


No need to lose STL or anything C++ brings.

Hope you get the idea [smile].

Share this post


Link to post
Share on other sites
You can very much see that it was written by a C++ programmer![smile]

I don't think C programmers would even use a dynamically sized array for this. You can just allocate a large array that will be at least as large as the largest possible size the output will compress to, and use that. Since your C-vector grows by a factor of two anyway, your current code will probably use at least as much memory on a file that doesn't compress well. I'd suggest figuring out the worst case and doing one malloc at the start.
If you want to save memory post-compression, then you can allocate an array of the exact size afterwards and copy the output into that.

Another idea is to only allocate one byte more than the input size, prepend a zero byte to the output, and constantly check that you don't exceed the output buffer size during compression. The moment you do exceed that size, you write a one in the first byte and then copy the input bytes directly to the rest of the output, wiping over the parital compression data.
The decompressor then checks the first byte to know whether decompression is necessary at all, or if the second byte onwards already contains the data.

Share this post


Link to post
Share on other sites
Hi

I started with C years ago and then switched to C++, and if I wanted to do something like this, I would do it very much like you did. The only thing I can say is that I would use typedef for the struct, so I didn't have to write as much:)

For examlpe:


typedef struct {
/* ... */
} vector;

void vector_clear(vector* p)
{
/* ... */
}

Share this post


Link to post
Share on other sites
Quote:
Original post by Lode

*** Source Snippet Removed ***

A function that uses the vectors:

*** Source Snippet Removed ***



Looks nice.
If all that "struct" in the typename is not desired you could typedef your vector.

If all of your three implementations are the same you might want to use Macros to make it a bit more generic and weird, and ofcourse impossible to debug.


#include <stdio.h>

#define vectorop(type, parameter1) { type i = parameter1; return i-1; }

unsigned int ucvectorop(unsigned int param)
{
vectorop(unsigned char, param)
}


int main(int argc, char** argv) {

unsigned char a = ucvectorop(0);
printf( "%u", a );
return 0;
}




You would partially loose typesafety, but you would be able to unify the implementation.

hope that helps

Share this post


Link to post
Share on other sites
Quote:
Original post by hydroo
If all that "struct" in the typename is not desired you could typedef your vector.


Quote:
Original post by morgue
The only thing I can say is that I would use typedef for the struct, so I didn't have to write as much:)


Oh, that's very nice to know! Thanks guys!

Share this post


Link to post
Share on other sites
One issue is that you have hooks for destructors, but not copying (unless I've missed something). This breaks the rule of three, however applicable or otherwise it is in the C world. vector_set() doesn't call a destructor, for example, before over-writing an element. Similar things are true of some of the other functions, too.

FYI: I noticed that vector_resize also has a memory leak when realloc fails.

Though I understand your motivation, I think the implementation is inconsistent with C++ thinking due in part to implementation problems, but also to the fact that this just isn't C++ anymore. I would find this interface strange to use in C, where types don't have behaviours. A lot of C programmers actually like this. Imposing a C++ way of thinking on C users might not sit too well with some.

I'm curious how the C community would react to an interface that looks something like the following, however:


#define array_create(type, elems) array_create_impl(sizeof(type), (elems))

void *array_create_impl(size_t type_sz, size_t elems);
void *array_destroy(void *array);
void *array_resize(void *array, size_t elems);
size_t array_length(const void *array);




The idea is that there is no struct holding all the pieces. Instead the details are packed in to the same dynamically allocated block as the array's memory.

So, array_create_impl() might be implemented something like this. I'm not being particularly careful about alignment here, and you may not agree with lowercase macro names, but you'll get the idea:


void *array_create_impl(size_t type_sz, size_t elems)
{
const size_t header_size = 2 * sizeof(size_t);
size_t *base = malloc(header_size + type_sz * elems);
if (base == NULL) return NULL;

base[0] = type_sz;
base[1] = elems;
return base + 2;
}




This allows one to access the array naturally and drastically reduces the amount of casting needed. Certainly I wouldn't expect any more frills than those provided by the 4 functions I've declared, except perhaps for allowing custom allocation and deallocation routines.

Share this post


Link to post
Share on other sites
I kind of like that idea, the_edd, maybe I'll really implement it like that. Only disadvantage is that too free it you must remember to free the bytes before it too so you have to use the array_destroy function, even if your void* array would be passed along to some external place where they've long forgotten that the bytes before it also have stuff in them.

So maybe I'll still keep the structs instead (that also saves all alignment problems), but give all the functions somewhat different names that make it look less C++ like and make it not suggest the rule of three :)

About the memory leak in resize: after looking in man realloc (which says it returns NULL if it failed), should this fix it?

  void vector_resize(vector* p, unsigned size)
{
if(size * p->typesize > p->allocsize)
{
size_t newsize = size * p->typesize * 2;
void* data = realloc(p->data, newsize);
if(data)
{
p->allocsize = newsize;
p->data = data;
p->size = size;
}
}
else p->size = size;
}



EDIT: Ok so I was picking other names than ctor and dtor, I thought about init and ....? So I typed in google: "opposite of init", and one of the first results was a gamedev.net thread about words for the opposite of initialize. Isn't GDNet great :)

[Edited by - Lode on December 30, 2007 6:09:46 PM]

Share this post


Link to post
Share on other sites
Quote:
Original post by Lode
I kind of like that idea, the_edd, maybe I'll really implement it like that. Only disadvantage is that too free it you must remember to free the bytes before it too so you have to use the array_destroy function, even if your void* array would be passed along to some external place where they've long forgotten that the bytes before it also have stuff in them.


That's very true, but it's often the way in C. I can understand you wanting to stick with the struct-based approach, though.

Another thing you might consider would be to add fields to your struct for the constructor/destructor function pointers. This would save you passing them in each time. You'd only have to change things in one place, too, if you wanted to change to different routines.

Quote:

About the memory leak in resize: after looking in man realloc (which says it returns NULL if it failed), should this fix it?

*** Source Snippet Removed ***

Yeah that looks better.

Share this post


Link to post
Share on other sites
Quote:
Original post by the_edd
One issue is that you have hooks for destructors, but not copying (unless I've missed something). This breaks the rule of three, however applicable or otherwise it is in the C world. vector_set() doesn't call a destructor, for example, before over-writing an element. Similar things are true of some of the other functions, too.

FYI: I noticed that vector_resize also has a memory leak when realloc fails.

Though I understand your motivation, I think the implementation is inconsistent with C++ thinking due in part to implementation problems, but also to the fact that this just isn't C++ anymore. I would find this interface strange to use in C, where types don't have behaviours. A lot of C programmers actually like this. Imposing a C++ way of thinking on C users might not sit too well with some.
These are some of the things I also noticed and I very much agree with the above.

So, no comments at all on my idea posted earlier?

Share this post


Link to post
Share on other sites
Quote:
Original post by iMalcSo, no comments at all on my idea posted earlier?


Well I checked the interface of zlib.h and they also do it with just an allocated chunk of memory you have to give to it, very much like you say. To decompress you even have to give it allocated memory of at least the exact size the output will be, which you need to get from other means.

The idea of the first value indicating if it was compressed to become smaller or not
looks fine for just the final input and output of compression and decompression, another very C-way, judging from the other replies :). But here the vectors/dynamic arrays are used for more than just the output buffer of compression though, they're also used for example the vector of vector of values like in the example in the original post, and also somewhere to generate a huffman tree where there are also vectors of some struct that also contain more vectors, I thought maybe that it's better to have one general thing that does it instead of finding a separate system for every case.

Maybe I'll not give these vectors to the final output but just a buffer and a size, after all it's perfectly possible to convert these vectors to a buffer + size, and vica versa, unlike C++ where you can't safely convert from a buffer to an std::vector :)

About placing pointers to the constructor and destructor in the vector, I've been considering it actually, would be nice for a more general purpose version of the vectors :)

Share this post


Link to post
Share on other sites

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now

Sign in to follow this