ecnelises Bonvenon al la malpura mondo

用C写的一个简易JSON Parser

初来乍到的一学期实在是无聊,于是在百团大战中加了计算机协会,就认识了几个计算机和软件专业的人。看见一软件同学在空间上转发的C语言期末项目,顿时生了兴趣。代码是春节期间写完的,可惜这个博客被搁置了太久,现在把代码贴上来,也不算过分吧。

作业的具体内容是:老师给定一个头文件 json.h ,包含了 json 数据结构的定义以及一些相关函数的声明,任务就是实现这些函数,最后提交通过学校的自动化测试。我不是软件的学生,自然没有这个权利,不过手工检验了一下,貌似还没有太大问题。只是最近写一个 web 项目的时候,把其间一个比较“庞大”的 json 文件(也才4KB!)当作测试材料,结果程序直接崩溃了;再截取一小部分,又能运行了。难道是字符串读写复制操作太多?可貌似连 fopen 函数都调用失败了,搞不懂。

老师们给的头文件想必是由 cJSON 的头文件修改而来的,json结构体的定义如下:

typedef struct JSON {
    int type;   /* The type of the item, as above. */

    char *valuestring;  /* The item's string, if type==JSON_STRING */
    int valueint;   /* The item's number, if type==JSON_TRUE||JSON_FLASE;
                      or quantity of members, if type==JSON_ARRAY||JSON_OBJECT */
    double valuedouble; /* The item's number, if type==JSON_NUMBER */

    // the following are created by me
    struct JSON *head; /* Head of array, if type==JSON_ARRAY||JSON_OBJECT */
    struct JSON *last; /* Previous member, if this belongs to an array or object */
    struct JSON *next; /* Next member, if this belongs to an array or object */
    char *name; /* Name of member, if this belongs to an object */
} JSON;

函数众多就不一一列举了,有几个实现起来相对复杂的:

void PrintJSON(JSON *item);
JSON *Duplicate(JSON *item, int recurse);
JSON *ParseJSON(const char *value);```

如果有朋友对 json 不熟悉的话,简单介绍一下:

{
    "name": "ecnelises",
    "school": {
        "primary": "Chenjiawan",
        "junior": "qinghua",
        "senior": "yucai",
        "university": "tongji"
    },
    "randomvalue": [
        1.5,
        18,
        true,
        null
    ]
}

json 的类型有 object、array、string、number、boolean(true、false)、null. array 用方括号 [] 括起来,object是“键值对”(类似C++中的 map),用花括号 {} 括起来。json 是一种简化版的数据结构(相比于庞杂的 xml),在 javascript 和其他地方广泛使用。

第一个要介绍的函数是 Print,顾名思义就是根据既有的 json 打印出规整的内容。这个函数本身的定义很简单:

void PrintJSON(JSON *item)
{
    if (item == NULL){
        return;
    }

    switch (item -> type) {
        case JSON_FALSE:
            printf("false");
            break;
        case JSON_TRUE:
            printf("true");
            break;
        case JSON_NULL:
            printf("null");
            break;
        case JSON_NUMBER:
            printf("%lf", item -> valuedouble);
            break;
        case JSON_STRING:
            printf("\"%s\"", item -> valuestring);
            break;
        case JSON_ARRAY:
            PrintArray(1, item, 1);
            break;
        case JSON_OBJECT:
            PrintObject(1, item, 1);
            break;
        default:
            break;
    }
}

关键在于调用的另外两个函数:PrintArray 和 PrintObject. 引入这两个函数的原因是方便递归。

void PrintArray(int layers, JSON *array, int newline)
{
    if (newline) {
        for (int i = 1; i < layers; ++i) {
            printf("    ");
        }
        printf("[\n");
    }

    JSON *find = array -> head;
    for (int c = 1; c <= array -> valueint; ++c) {
        if (find -> type == JSON_OBJECT) {
            PrintObject(layers + 1, find, 1);
        } else if (find -> type == JSON_ARRAY) {
            PrintArray(layers + 1, find, 1);
        } else {
            for (int i = 1; i < layers + 1; ++i) {
                printf("    ");
            }
            PrintJSON(find);
        }
        if (c < array -> valueint) {
            printf(",");
        }
        printf("\n");
        find = find -> next;
    }

    for (int c = 1; c < layers; ++c) {
        printf("    ");
    }
    printf("]");

    if (layers == 1) {
        printf("\n");
    }
}

void PrintObject(int layers, JSON *object, int newline)
{
    if (newline) {
        for (int i = 1; i < layers; ++i) { // Each calling executes all in the same layer
            printf("    ");
        }
        printf("{\n");
    }

    JSON *find = object -> head;
    for (int c = 1; c <= object -> valueint; ++c) {
        for (int i = 1; i < layers + 1; ++i) {
            printf("    ");
        }
        printf("\"%s\": ", find -> name);

        if (find -> type == JSON_OBJECT) {
            printf("{\n");
            PrintObject(layers + 1, find, 0); // '{' doesn't make a new line
        } else if (find -> type == JSON_ARRAY) {
            printf("[\n");
            PrintArray(layers + 1, find, 0);
        } else {
            PrintJSON(find);
        }
        if (c < object -> valueint) {
            printf(",");
        }
        printf("\n");
        find = find -> next;
    }

    for (int i = 1; i < layers; ++i) {
        printf("    ");
    }

    printf("}");

    if (layers == 1) {
        printf("\n");
    }
}

参数 layers 决定了当前要打印的东西在第几“层”,也就是说前面要有多少层缩进。newline 是为了处理冒号后 { 或 [ 不换行的情况。

下面的函数 Duplicate 是负责复制 json 的,recurse 参数决定是否“深度”复制,我想大概是递归的意思吧。

JSON *Duplicate(JSON *item, int recurse)
{
    JSON *source = item;
    JSON *target = NULL;
    JSON *last = NULL;
    JSON *count;

    while (source != NULL) {
        switch (source -> type) {
            case JSON_TRUE:
                target = CreateTrue();
                break;
            case JSON_FALSE:
                target = CreateFalse();
                break;
            case JSON_NUMBER:
                target = CreateNumber(source -> valuedouble);
                break;
            case JSON_NULL:
                target = CreateNULL();
                break;
            case JSON_STRING:
                target -> valuestring = CopyString(source -> valuestring);
                break;
            case JSON_ARRAY:
                target = CreateArray();

                if (source -> head != NULL && recurse) {
                    target -> head = Duplicate(source -> head, 1);
                } else if (source -> head != NULL && !recurse) {
                    target -> head = source -> head;
                }

                count = target -> head;
                while (count != NULL) {
                    target -> valueint += 1;
                    count = count -> next;
                }
                break;
            case JSON_OBJECT:
                target = CreateObject();
                target -> name = CopyString(source -> name);

                if (source -> head != NULL && recurse) {
                    target -> head = Duplicate(source -> head, 1);
                } else if (source -> head != NULL && !recurse) {
                    target -> head = source -> head;
                }

                count = target -> head;
                while (count != NULL) {
                    target -> valueint += 1;
                    count = count -> next;
                }
                break;
            default:
                break;
        }

        source = source -> next;
        target -> last = last;
        if (last != NULL) {
            last -> next = target;
        } else {
            item = target; // Give the begining position to item
        }
        last = target;
        target = target -> next;
    }

    return item;
}

函数思路简单,但是算不上太好写……用“同步跟踪”的方式从当前 json 的子节点开始,最后再把头补上。

相对而言最复杂的要数 ParseJSON 函数了。基本想法就是先搜索配对的括号,然后递归处理括号里边的内容,直到没括号的时候就根据冒号和逗号分隔开各个字符串然后进行解析。由于源字符串可以不是规整的,所以还得跳过其间的空白。没有学过编译原理,所以程序也不带出错功能,错误的输入结果是未知的。

JSON *ParseJSON(const char *value)
{
    /* We read the whole string char by char,
       and create a new function parse_object.
       We make two tables to record emergency of {} and [].
     */
    int blank_state = 1; // 是否空白
    int flayerl = 1;
    int flayers = 1;
    int lallb = 0, sallb = 0; // 记录所有出现的括号
    int lbrackets = 0, sbrackets = 0; // 记录“第一层”的括号
    unsigned long vallen = strlen(value);
    int *lbracket_table = NULL;
    int *sbracket_table = malloc(sizeof(int) * sbrackets);

    for (int i = 0; i < vallen; ++i) {
        if (value[i] == '{' && flayerl) {
            flayerl = 0;
            ++lbrackets;
            ++lallb;
        } else if (value[i] == '{' && !flayerl) {
            ++lallb;
        } else if (value[i] == '}' && (lallb != lbrackets)) {
            --lallb;
        } else if (value[i] == '}' && (lallb == lbrackets)) {
            flayerl = 1;
            lbracket_table = (int*)realloc(lbracket_table, sizeof(int) * lbrackets);
            lbracket_table[lbrackets - 1] = i;
        } else if (value[i] == '[' && flayers) {
            flayers = 0;
            ++sbrackets;
            ++sallb;
        } else if (value[i] == '[' && !flayers) {
            ++sallb;
        } else if (value[i] == ']' && (sallb != sbrackets)) {
            --sallb;
        } else if (value[i] == ']' && (sallb == sbrackets)) {
            flayers = 1;
            sbracket_table = (int*)realloc(sbracket_table, sizeof(int) * sbrackets);
            sbracket_table[sbrackets - 1] = i;
        }
    }
    lbrackets = 0;
    sbrackets = 0;
    // 在这里,由于函数是递归调用的关系,两个表只存储“第一层”括号的内容

    JSON *res = malloc(sizeof(JSON));
    JSON *pre = res;
    JSON *find = res -> next;
    char *middle;

    for (int i = 0; i < vallen; ++i) {
        if (blank_state && isspace(value[i])) {
            continue;
        } else if (blank_state && !isspace(value[i])) {
            switch (value[i]) {
                case '{':
                    find = parse_object(middle = substr(value + i, '{', '}'));
                    free(middle);
                    i = lbracket_table[lbrackets++] + 1;
                    pre -> next = find;
                    find -> last = pre;
                    pre = pre -> next;
                    find = find -> next;
                    break;
                case '[':
                    find = parse_array(middle = substr(value + i, '[', ']'));
                    free(middle);
                    i = sbracket_table[sbrackets++] + 1;
                    pre -> next = find;
                    find -> last = pre;
                    pre = pre -> next;
                    find = find -> next;
                    break;
                case '\"':
                    find = parse_string(value + i);
                    pre -> next = find;
                    find -> last = pre;
                    pre = pre -> next;
                    find = find -> next;
                    i += next_mark(value + i + 1, '\"'); // 返回出现第二个参数对应字符的下一个位置
                    break;
                case 'f':
                case 't':
                case 'n': // 猜测是否为 false、true 或者 null,在函数里验证
                    find = parse_bool_null(value + i);
                    pre -> next = find;
                    find -> last = pre;
                    pre = pre -> next;
                    find = find -> next;
                    blank_state = 0;
                    break;
                case '+':
                case '-':
                default:
                    find = parse_number(value + i);
                    pre -> next = find;
                    find -> last = pre;
                    pre = pre -> next;
                    find = find -> next;
                    blank_state = 0;
                    break;
            }
        } else {
            switch (value[i]) {
                case ',':
                    blank_state = 1;
                    continue;
                    break;
                default:
                    continue;
                    break;
            }
        }
    }
    pre = res -> next;
    free(res);
    res = pre;
    res -> last = NULL;

    free(lbracket_table);
    free(sbracket_table);

    return res;
}

附上 parse_object 和 parse_array 的代码:

JSON *parse_object(const char *value)
{
    JSON *res = CreateObject();
    JSON *tmp;
    int blank_state = 1;
    char *left;
    char *right;

    for (int i = 0; i < strlen(value); ++i) {
        if (blank_state && isspace(value[i])) {
            continue;
        } else if (blank_state && !isspace(value[i])) {
            left = divstr(value + i); // 返回第一个在所有括号之外的 “:” 或者 “,” 的位置
            i += next_mark(value + i, ':') + 1;
            AddItemToObject(res, (tmp = parse_string(left)) -> valuestring, ParseJSON(right = divstr(value + i)));
            free(left);
            free(right);
            free(tmp);
            i += next_mark(value + i, ','); // when this loop end, i will be added by 1
            blank_state = 1; // so here we don't have 1 as an addition
        }
    }
    return res;
}

JSON *parse_array(const char *value)
{
    JSON *res = CreateArray();
    char *middle;
    int blank_state = 1;

    for (int i = 0; i < strlen(value); ++i) {
        if (blank_state && isspace(value[i])) {
            continue;
        } else if (blank_state && !isspace(value[i])) {
            AddItemToArray(res, ParseJSON(middle = divstr(value + i)));
            free(middle);
            i += next_mark(value + i, ',');
            blank_state = 1;
        }
    }
    return res;
}

虽然自己接触C语言有些久了,但是真正写出一个达千行的程序,这还是第一次。寒假在家里反复改反复调试,看来自己代码能力还是很不行。虽然在高手们看来可能这个程序还很拙劣,不过成功运行之后还是很有成就感的。加油吧。