diff options
Diffstat (limited to 'tools/lzcomp.c')
-rw-r--r-- | tools/lzcomp.c | 404 |
1 files changed, 404 insertions, 0 deletions
/*
 * tools/lzcomp.c
 *
 * Command-line compressor: reads a source file of at most 32768 bytes and
 * writes it as a stream of run/copy commands followed by a 0xff terminator.
 *
 * Each command in the output starts with a header holding the command number
 * and the (count - 1) value:
 *   - short form, when count - 1 < 32:  one byte, (command << 5) + (count - 1)
 *   - long form, otherwise:             224 + (command << 2) + high bits of
 *                                       (count - 1), then its low byte
 * followed by the command's operand bytes (see write_command_to_file).
 *
 * Command numbers as used by this file:
 *   0  literal data taken from the input at offset `value`
 *   1  repetition of the single byte `value`
 *   2  repetition of the alternating two bytes held in `value`
 *   3  repetition of zero bytes
 *   4  copy of earlier data
 *   5  copy of earlier data, matched against the bit-reversed input
 *   6  copy of earlier data, read backwards
 *   7  "no command" marker; never written to the output
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>

#define COMPRESSION_METHODS 72

/* Largest input file accepted, in bytes. */
#define MAX_FILE_SIZE 32768
/* Largest byte count a single command can carry. */
#define MAX_COMMAND_COUNT 1024
/* Largest byte count that still fits the one-byte command header. */
#define MAX_SHORT_COUNT 32

struct command {
  unsigned command: 3;  /* command number, 7 meaning "no command" */
  unsigned count: 12;   /* number of output bytes the command produces */
  signed value: 17;     /* operand: byte value(s), input offset or copy offset */
};

int main(int argc, char ** argv);
void error_exit(int error_code, const char * error, ...);
void bit_flip(const unsigned char * data, unsigned short length, unsigned char * result);
unsigned char * read_file_into_buffer(const char * file, unsigned short * size);
void write_commands_to_file(const char * file, const struct command * commands, unsigned count, const unsigned char * input_stream);
void write_command_to_file(FILE * fp, struct command command, const unsigned char * input_stream);
struct command * compress(const unsigned char * data, unsigned short * size);
struct command * try_compress(const unsigned char * data, const unsigned char * bitflipped, unsigned short * length, unsigned flags);
struct command find_best_copy(const unsigned char * data, unsigned short position, unsigned short length, const unsigned char * bitflipped, unsigned flags);
unsigned short scan_forwards(const unsigned char * target, unsigned short limit, const unsigned char * source, unsigned short real_position, short * offset);
unsigned short scan_backwards(const unsigned char * data, unsigned short limit, unsigned short real_position, short * offset);
struct command find_best_repetition(const unsigned char * data, unsigned short position, unsigned short length);
struct command pick_best_command(unsigned count, struct command command, ...);
int is_better(struct command new, struct command old);
short command_size(struct command command);
void optimize(struct command * commands, unsigned short count);
void repack(struct command ** commands, unsigned short * length);
struct command * select_command_sequence(struct command ** sequences, const unsigned short * lengths, unsigned count, unsigned short * final_length);
struct command * merge_command_sequences(const struct command * current, unsigned short current_length, const struct command * new, unsigned short new_length, unsigned short * result_length);
unsigned short compressed_length(const struct command * commands, unsigned short count);

static void * checked_malloc(size_t size);

/*
 * Entry point: argv[1] is the file to compress, argv[2] the output file.
 * Returns 3 on a usage error; other failures exit through error_exit().
 */
int main (int argc, char ** argv) {
  if (argc < 3) {
    fprintf(stderr, "usage: %s <source file> <compressed output>\n", *argv);
    return 3;
  }
  unsigned short size;
  unsigned char * file_buffer = read_file_into_buffer(argv[1], &size);
  /* compress() replaces size with the number of commands it returns. */
  struct command * compressed = compress(file_buffer, &size);
  write_commands_to_file(argv[2], compressed, size, file_buffer);
  free(file_buffer);
  free(compressed);
  return 0;
}

/*
 * Prints "error: " followed by the printf-style message and a newline to
 * stderr, then terminates the process with error_code. Never returns.
 */
void error_exit (int error_code, const char * error, ...) {
  va_list ap;
  va_start(ap, error);
  fputs("error: ", stderr);
  vfprintf(stderr, error, ap);
  fputc('\n', stderr);
  va_end(ap);
  exit(error_code);
}

/*
 * malloc() wrapper that exits through error_exit() when the allocation
 * fails. A zero-byte request is rounded up to one byte so the result is
 * always a valid pointer that may be handed to memset/memcpy/free.
 */
static void * checked_malloc (size_t size) {
  void * result = malloc(size ? size : 1);
  if (!result) error_exit(1, "out of memory");
  return result;
}

/*
 * Writes to result the `length` bytes of data with the bit order of every
 * byte reversed (bit 0 becomes bit 7 and so on).
 */
void bit_flip (const unsigned char * data, unsigned short length, unsigned char * result) {
  unsigned char new_value, pos;
  while (length --) {
    new_value = 0;
    for (pos = 0; pos < 8; pos ++) new_value |= ((*data >> pos) & 1) << (7 - pos);
    *(result ++) = new_value;
    data ++;
  }
}

/*
 * Reads the whole of `file` into a freshly allocated buffer and stores the
 * number of bytes read in *size. Exits if the file cannot be opened or read,
 * or if it holds more than MAX_FILE_SIZE bytes. The caller frees the buffer.
 */
unsigned char * read_file_into_buffer (const char * file, unsigned short * size) {
  FILE * fp = fopen(file, "rb");
  if (!fp) error_exit(1, "could not open file %s for reading", file);
  /* One extra byte is requested so that an oversized file can be detected. */
  unsigned char * buf = checked_malloc(MAX_FILE_SIZE + 1);
  size_t rv = fread(buf, 1, MAX_FILE_SIZE + 1, fp);
  /* fread() cannot return a negative value; errors are reported by ferror(). */
  int read_failed = ferror(fp);
  fclose(fp);
  if (read_failed) error_exit(1, "could not read from file %s", file);
  if (rv > MAX_FILE_SIZE) error_exit(1, "file %s is too big", file);
  *size = (unsigned short) rv;
  return buf;
}

/*
 * Writes `count` commands to `file`, followed by the 0xff terminator byte.
 * input_stream is the uncompressed data that literal commands refer to.
 */
void write_commands_to_file (const char * file, const struct command * commands, unsigned count, const unsigned char * input_stream) {
  FILE * fp = fopen(file, "wb");
  if (!fp) error_exit(1, "could not open file %s for writing", file);
  while (count --) write_command_to_file(fp, *(commands ++), input_stream);
  unsigned char terminator = 0xff;
  if (fwrite(&terminator, 1, 1, fp) != 1) error_exit(1, "could not write terminator to compressed output");
  /* fclose() flushes buffered data, so a failure here means a short file. */
  if (fclose(fp)) error_exit(1, "could not close compressed output");
}

/*
 * Encodes one command to fp: the header, the operand bytes and, for literal
 * commands (command 0), the `count` input bytes starting at offset `value`.
 * Exits with code 2 if the command cannot be represented.
 */
void write_command_to_file (FILE * fp, struct command command, const unsigned char * input_stream) {
  /* Command 7 is the internal "no command" marker; in short form its header
     byte would be indistinguishable from a long-form header. */
  if ((!command.count) || (command.count > MAX_COMMAND_COUNT) || (command.command == 7))
    error_exit(2, "invalid command in output stream");
  unsigned char buf[4];
  unsigned char * pos = buf;
  unsigned n;
  unsigned stored_count = command.count - 1; /* the header stores count - 1 */
  if (stored_count < MAX_SHORT_COUNT)
    *(pos ++) = (command.command << 5) + stored_count;
  else {
    *(pos ++) = 224 + (command.command << 2) + (stored_count >> 8);
    *(pos ++) = stored_count;
  }
  switch (command.command) {
    case 1: case 2:
      /* One or two operand bytes, least significant byte first. */
      if ((command.value < 0) || (command.value >= (1 << (command.command << 3)))) error_exit(2, "invalid command in output stream");
      for (n = 0; n < command.command; n ++) *(pos ++) = command.value >> (n << 3);
      break;
    case 0: case 3:
      /* No operand bytes. */
      break;
    default:
      /* Copy commands: negative offsets take one byte, others two. */
      if ((command.value < -128) || (command.value > 32767)) error_exit(2, "invalid command in output stream");
      if (command.value < 0)
        *(pos ++) = command.value ^ 127;
      else {
        *(pos ++) = command.value >> 8;
        *(pos ++) = command.value;
      }
  }
  size_t header_length = (size_t) (pos - buf);
  if (fwrite(buf, 1, header_length, fp) != header_length) error_exit(1, "could not write command to compressed output");
  if (command.command) return;
  if (fwrite(input_stream + command.value, 1, command.count, fp) != command.count) error_exit(1, "could not write data to compressed output");
}

/*
 * Compresses the *size bytes at data. Every one of the COMPRESSION_METHODS
 * flag combinations is tried and the results are combined by
 * select_command_sequence(). Returns a newly allocated command array and
 * stores its element count in *size. The caller frees the array.
 */
struct command * compress (const unsigned char * data, unsigned short * size) {
  unsigned char * bitflipped = checked_malloc(*size);
  bit_flip(data, *size, bitflipped);
  struct command * compressed_sequences[COMPRESSION_METHODS];
  unsigned short lengths[COMPRESSION_METHODS];
  unsigned current;
  for (current = 0; current < COMPRESSION_METHODS; current ++) {
    lengths[current] = *size;
    compressed_sequences[current] = try_compress(data, bitflipped, lengths + current, current);
  }
  free(bitflipped);
  struct command * result = select_command_sequence(compressed_sequences, lengths, COMPRESSION_METHODS, size);
  for (current = 0; current < COMPRESSION_METHODS; current ++) free(compressed_sequences[current]);
  return result;
}

/*
 * Runs one compression pass over data using the strategy selected by flags:
 *   bit 0           which of copy/repetition wins when they save equally
 *   bit 1           replace a command that saves nothing by a literal byte
 *                   when it follows literal data (except right after 32 or
 *                   1024 literal bytes)
 *   bit 2           passed on to find_best_copy(), which caps copies at 32
 *   (flags >> 3) % 3  number of consecutive non-literal commands forced to
 *                   literal bytes before one is accepted
 *   flags / 24      preference order of the copy kinds in find_best_copy()
 * On entry *length is the input length; on return it is the number of
 * commands in the returned, newly allocated array.
 */
struct command * try_compress (const unsigned char * data, const unsigned char * bitflipped, unsigned short * length, unsigned flags) {
  struct command * commands = checked_malloc(sizeof(struct command) * *length);
  /* All-ones bytes make every slot a command 7, i.e. "no command". */
  memset(commands, -1, sizeof(struct command) * *length);
  struct command * current_command = commands;
  unsigned short position = 0, previous_data = 0;
  unsigned char lookahead = 0, lookahead_flag = (flags >> 3) % 3;
  struct command copy, repetition;
  while (position < *length) {
    copy = find_best_copy(data, position, *length, bitflipped, flags);
    repetition = find_best_repetition(data, position, *length);
    /* pick_best_command() keeps the earlier argument on ties. */
    if (flags & 1)
      *current_command = pick_best_command(2, repetition, copy);
    else
      *current_command = pick_best_command(2, copy, repetition);
    *current_command = pick_best_command(2, (struct command) {.command = 0, .count = 1, .value = position}, *current_command);
    if (flags & 2) {
      if (previous_data && (previous_data != MAX_SHORT_COUNT) && (previous_data != MAX_COMMAND_COUNT) && (command_size(*current_command) == current_command -> count))
        *current_command = (struct command) {.command = 0, .count = 1, .value = position};
    }
    if (lookahead_flag) {
      if (lookahead >= lookahead_flag)
        lookahead = 0;
      else if (current_command -> command) {
        lookahead ++;
        *current_command = (struct command) {.command = 0, .count = 1, .value = position};
      }
    }
    /* previous_data tracks the length of the current run of literal bytes. */
    if (current_command -> command)
      previous_data = 0;
    else
      previous_data += current_command -> count;
    position += (current_command ++) -> count;
  }
  optimize(commands, current_command - commands);
  repack(&commands, length);
  return commands;
}

/*
 * Finds the best copy command (4, 5 or 6) for the data at position, or a
 * command 7 if nothing earlier matches. flags / 24 selects which kind is
 * preferred on ties; flags & 4 limits the copy to MAX_SHORT_COUNT bytes.
 */
struct command find_best_copy (const unsigned char * data, unsigned short position, unsigned short length, const unsigned char * bitflipped, unsigned flags) {
  struct command simple = {.command = 7};
  struct command flipped = simple, backwards = simple;
  short count, offset;
  count = scan_forwards(data + position, length - position, data, position, &offset);
  if (count) simple = (struct command) {.command = 4, .count = count, .value = offset};
  count = scan_forwards(data + position, length - position, bitflipped, position, &offset);
  if (count) flipped = (struct command) {.command = 5, .count = count, .value = offset};
  count = scan_backwards(data, length - position, position, &offset);
  if (count) backwards = (struct command) {.command = 6, .count = count, .value = offset};
  struct command command;
  switch (flags / 24) {
    case 1: command = pick_best_command(3, backwards, flipped, simple); break;
    case 2: command = pick_best_command(3, flipped, backwards, simple); break;
    default:
      /* 0, and any out-of-range value so the result is never uninitialized. */
      command = pick_best_command(3, simple, backwards, flipped);
  }
  if ((flags & 4) && (command.count > MAX_SHORT_COUNT)) command.count = MAX_SHORT_COUNT;
  return command;
}

/*
 * Searches source[0 .. real_position - 1] for the longest match with target,
 * comparing both in increasing address order, up to `limit` bytes (capped at
 * MAX_COMMAND_COUNT). Among equally long matches the last one wins. Returns
 * the match length, or 0 if there is none; on success *offset holds the
 * match position, expressed relative to real_position (negative) when it
 * lies within 128 bytes of it and as an absolute position otherwise.
 */
unsigned short scan_forwards (const unsigned char * target, unsigned short limit, const unsigned char * source, unsigned short real_position, short * offset) {
  unsigned short best_match = 0, best_length = 0;
  unsigned short current_length;
  unsigned short position;
  for (position = 0; position < real_position; position ++) {
    if (source[position] != *target) continue;
    for (current_length = 0; (current_length < limit) && (source[position + current_length] == target[current_length]); current_length ++);
    if (current_length > MAX_COMMAND_COUNT) current_length = MAX_COMMAND_COUNT;
    if (current_length < best_length) continue;
    best_match = position;
    best_length = current_length;
  }
  if (!best_length) return 0;
  if ((best_match + 128) >= real_position)
    *offset = best_match - real_position;
  else
    *offset = best_match;
  return best_length;
}

/*
 * Like scan_forwards(), but the earlier data is read in decreasing address
 * order starting at the candidate position, while the data at real_position
 * is read forwards. Returns the match length (0 if none) and stores the
 * match position in *offset using the same relative/absolute rule.
 */
unsigned short scan_backwards (const unsigned char * data, unsigned short limit, unsigned short real_position, short * offset) {
  if (real_position < limit) limit = real_position;
  unsigned short best_match = 0, best_length = 0;
  unsigned short current_length;
  unsigned short position;
  for (position = 0; position < real_position; position ++) {
    if (data[position] != data[real_position]) continue;
    /* current_length <= position keeps position - current_length from going
       below zero, which would read before the start of the buffer. */
    for (current_length = 0;
         (current_length <= position) && (current_length < limit) &&
         (data[position - current_length] == data[real_position + current_length]);
         current_length ++);
    if (current_length > MAX_COMMAND_COUNT) current_length = MAX_COMMAND_COUNT;
    if (current_length < best_length) continue;
    best_match = position;
    best_length = current_length;
  }
  if (!best_length) return 0;
  if ((best_match + 128) >= real_position)
    *offset = best_match - real_position;
  else
    *offset = best_match;
  return best_length;
}

/*
 * Finds the repetition command (1, 2 or 3) describing the data at position:
 * a run of one byte value, of two alternating byte values, or of zeros.
 * Returns a command 7 when only one non-zero byte is left.
 */
struct command find_best_repetition (const unsigned char * data, unsigned short position, unsigned short length) {
  if ((position + 1) >= length) {
    /* Last byte of the input: only a single zero can be expressed here. */
    if (data[position]) return (struct command) {.command = 7};
    return (struct command) {.command = 3, .count = 1};
  }
  unsigned char value[2] = {data[position], data[position + 1]};
  unsigned repcount, limit = length - position;
  if (limit > MAX_COMMAND_COUNT) limit = MAX_COMMAND_COUNT;
  /* Even offsets must match the first byte, odd offsets the second. */
  for (repcount = 2; (repcount < limit) && (data[position + repcount] == value[repcount & 1]); repcount ++);
  /* value is zeroed so that no indeterminate bits are copied around. */
  struct command result = {.command = 3, .count = repcount, .value = 0};
  if (*value != value[1]) {
    /* A zero followed by an unrelated byte is better taken as one zero. */
    if (!*value && (repcount < 3)) return (struct command) {.command = 3, .count = 1};
    result.command = 2;
    result.value = ((unsigned) (*value)) | (((unsigned) (value[1])) << 8);
  } else if (*value) {
    result.command = 1;
    result.value = *value;
  }
  return result;
}

/*
 * Returns the best of `count` commands passed as arguments, as judged by
 * is_better(). On ties the earliest argument is kept. count must be at
 * least 1 and match the number of commands actually passed.
 */
struct command pick_best_command (unsigned count, struct command command, ...) {
  struct command result = command;
  va_list ap;
  va_start(ap, command);
  while (-- count) {
    command = va_arg(ap, struct command);
    if (is_better(command, result)) result = command;
  }
  va_end(ap);
  return result;
}

/*
 * Returns non-zero if `new` saves strictly more bytes than `old`, where the
 * saving is the byte count produced minus the encoded size. A command 7 is
 * never better than anything, and anything else is better than a command 7.
 */
int is_better (struct command new, struct command old) {
  if (new.command == 7) return 0;
  if (old.command == 7) return 1;
  short new_savings = new.count - command_size(new), old_savings = old.count - command_size(old);
  return new_savings > old_savings;
}

/*
 * Returns the number of bytes the command occupies in the output: a one- or
 * two-byte header plus the operand bytes (and the data itself for literals).
 */
short command_size (struct command command) {
  short header_size = 1 + (command.count > MAX_SHORT_COUNT);
  /* Commands 4-7: one offset byte when negative, two otherwise. */
  if (command.command & 4) return header_size + 1 + (command.value >= 0);
  switch (command.command) {
    case 0: return header_size + command.count;
    case 1: return header_size + 1;
    case 2: return header_size + 2;
    default: return header_size; /* command 3 has no operand */
  }
}

/*
 * Merges adjacent commands in place where that does not make the output
 * larger. Slots that are merged away are marked as command 7. `commands`
 * always points to the last command that was kept, `next` to the candidate
 * being examined.
 */
void optimize (struct command * commands, unsigned short count) {
  while (count && (commands -> command == 7)) commands ++, count --;
  if (count < 2) return;
  struct command * end = commands + count;
  struct command * next = commands + 1;
  while (next < end) {
    if (next -> command == 7) goto skip;
    /* A command that saves nothing is folded into the preceding literal data
       when that does not push the literal into the two-byte header form. */
    if (
      !(commands -> command) &&
      (command_size(*next) == next -> count) &&
      ((commands -> count + next -> count) <= MAX_COMMAND_COUNT) &&
      ((commands -> count > MAX_SHORT_COUNT) || ((commands -> count + next -> count) <= MAX_SHORT_COUNT))
    ) {
      commands -> count += next -> count;
      next -> command = 7;
      goto skip;
    }
    if (next -> command != commands -> command) goto accept;
    switch (commands -> command) {
      case 0:
        /* Literals merge only when they cover consecutive input bytes. */
        if ((commands -> value + commands -> count) != next -> value) break;
        commands -> count += next -> count;
        next -> command = 7;
        if (commands -> count <= MAX_COMMAND_COUNT) goto skip;
        /* Too long for one command: keep the excess in the second one. */
        next -> command = 0;
        next -> value = commands -> value + MAX_COMMAND_COUNT;
        next -> count = commands -> count - MAX_COMMAND_COUNT;
        commands -> count = MAX_COMMAND_COUNT;
        break;
      case 1:
        if (commands -> value != next -> value) break;
        /* fallthrough */
      case 3:
        if ((commands -> count + next -> count) <= MAX_COMMAND_COUNT) {
          commands -> count += next -> count;
          next -> command = 7;
          goto skip;
        }
        next -> count = (commands -> count + next -> count) - MAX_COMMAND_COUNT;
        commands -> count = MAX_COMMAND_COUNT;
        break;
      default:
        /* Other commands are never merged. */
        break;
    }
    accept:
    commands = next;
    skip:
    next ++;
  }
}

/*
 * Replaces *commands (an array of *length slots) with a newly allocated
 * array holding only the slots that are not command 7, frees the old array
 * and stores the new element count in *length.
 */
void repack (struct command ** commands, unsigned short * length) {
  struct command * new_commands = checked_malloc(sizeof(struct command) * *length);
  struct command * current = new_commands;
  unsigned short p;
  for (p = 0; p < *length; p ++) if ((*commands)[p].command != 7) *(current ++) = (*commands)[p];
  free(*commands);
  *commands = new_commands;
  *length = current - new_commands;
}

/*
 * Builds the final command sequence from `count` candidate sequences: it
 * starts from the candidate with the smallest compressed size and then
 * merges every other candidate into it with merge_command_sequences().
 * Returns a newly allocated array; its length is stored in *final_length.
 */
struct command * select_command_sequence (struct command ** sequences, const unsigned short * lengths, unsigned count, unsigned short * final_length) {
  unsigned short min_sequence = 0, min_length = compressed_length(*sequences, *lengths);
  unsigned short seq, len;
  for (seq = 1; seq < count; seq ++) {
    len = compressed_length(sequences[seq], lengths[seq]);
    if (len < min_length) {
      min_sequence = seq;
      min_length = len;
    }
  }
  *final_length = lengths[min_sequence];
  struct command * current = checked_malloc(*final_length * sizeof(struct command));
  memcpy(current, sequences[min_sequence], *final_length * sizeof(struct command));
  struct command * new;
  for (seq = 1; seq < count; seq ++) {
    /* Visit the remaining candidates in order, starting after the best one. */
    new = merge_command_sequences(current, *final_length, sequences[(seq + min_sequence) % count], lengths[(seq + min_sequence) % count], final_length);
    free(current);
    current = new;
  }
  return current;
}

/*
 * Merges two command sequences into a newly allocated one. Both sequences
 * are walked in step; whenever they line up on the same amount of data, the
 * cheaper of the two stretches since the previous alignment is taken (the
 * `current` stretch on ties). The loop only counts down current_length, so
 * both sequences are assumed to describe the same total amount of data.
 * The number of commands in the result is stored in *result_length.
 */
struct command * merge_command_sequences (const struct command * current, unsigned short current_length, const struct command * new, unsigned short new_length,
                                          unsigned short * result_length) {
  struct command * result = checked_malloc(sizeof(struct command) * (current_length + new_length));
  struct command * current_command = result;
  const struct command * saved_current;
  const struct command * saved_new;
  unsigned short current_pos, new_pos;
  while (current_length) {
    if (current -> count == new -> count) {
      *(current_command ++) = pick_best_command(2, *(current ++), *(new ++));
      current_length --;
      continue;
    }
    saved_current = current;
    saved_new = new;
    current_pos = (current ++) -> count;
    new_pos = (new ++) -> count;
    current_length --;
    /* Advance whichever side is behind until both cover the same data. */
    while (current_pos != new_pos)
      if (current_pos < new_pos) {
        current_pos += (current ++) -> count;
        current_length --;
      } else
        new_pos += (new ++) -> count;
    /* The position variables are reused to hold the two encoded sizes. */
    current_pos = compressed_length(saved_current, current - saved_current);
    new_pos = compressed_length(saved_new, new - saved_new);
    if (new_pos < current_pos) {
      memcpy(current_command, saved_new, sizeof(struct command) * (new - saved_new));
      current_command += new - saved_new;
    } else {
      memcpy(current_command, saved_current, sizeof(struct command) * (current - saved_current));
      current_command += current - saved_current;
    }
  }
  *result_length = current_command - result;
  return result;
}

/*
 * Returns the total encoded size, in bytes, of the `count` commands given,
 * ignoring any command 7 entries. The terminator byte is not included.
 */
unsigned short compressed_length (const struct command * commands, unsigned short count) {
  unsigned short current, total = 0;
  for (current = 0; current < count; current ++) if (commands[current].command != 7) total += command_size(commands[current]);
  return total;
}