unleashed-firmware/lib/mjs/mjs_string.c

/*
 * Copyright (c) 2017 Cesanta Software Limited
 * All rights reserved
 */

#include "mjs_string.h"
#include "common/cs_varint.h"
#include "common/mg_str.h"
#include "mjs_core.h"
#include "mjs_internal.h"
#include "mjs_primitive.h"
#include "mjs_util.h"

/*
 * Single-byte "rune" helpers. There is no UTF-8 handling here: one rune is
 * always exactly one byte of the string.
 */
typedef unsigned short Rune;

/* Loads the byte at `str` into `*rune`. Always consumes 1 byte. */
static int chartorune(Rune* rune, const char* str) {
    const unsigned char* bytes = (const unsigned char*)str;
    *rune = (Rune)bytes[0];
    return 1;
}

/* Stores `*rune`, converted to `char`, at `str`. Always produces 1 byte. */
static int runetochar(char* str, Rune* rune) {
    *str = (char)(*rune);
    return 1;
}

/*
 * Extra bytes requested on top of the required size when mjs_mk_string()
 * has to grow the owned-strings buffer. Can be overridden by defining it
 * before this point.
 */
#ifndef MJS_STRING_BUF_RESERVE
#define MJS_STRING_BUF_RESERVE 100
#endif

/*
 * Forward declaration; the definition is further down in this file.
 * Decodes backslash escapes in `s[0..len)` and returns the number of output
 * bytes. When `to` is NULL the output is only counted, not stored.
 */
MJS_PRIVATE size_t unescape(const char* s, size_t len, char* to);

/*
 * Forward declaration; the definition is further down in this file.
 * Inserts a varint length prefix followed by the string `p` (`len` input
 * bytes) into `m` at `offset`. `flags` is a bitmask of `enum embstr_flags`.
 */
MJS_PRIVATE void embed_string(
    struct mbuf* m,
    size_t offset,
    const char* p,
    size_t len,
    uint8_t /*enum embstr_flags*/ flags);

/*
 * Yields a `char*` to the first byte of the object `v`. `v` must be an
 * lvalue, because its address is taken.
 *
 * TODO(lsm): NaN payload location depends on endianness, make crossplatform
 */
#define GET_VAL_NAN_PAYLOAD(v) ((char*)&(v))

/* Returns 1 if `v` carries one of the string tags, 0 otherwise. */
int mjs_is_string(mjs_val_t v) {
    const uint64_t tag = v & MJS_TAG_MASK;

    if(tag == MJS_TAG_STRING_I) return 1;
    if(tag == MJS_TAG_STRING_F) return 1;
    if(tag == MJS_TAG_STRING_O) return 1;
    if(tag == MJS_TAG_STRING_5) return 1;
    return tag == MJS_TAG_STRING_D;
}

/*
 * Creates a string value from the `len` bytes at `p`.
 *
 * If `len` is ~((size_t)0), the length is taken from `strlen(p)` (a NULL `p`
 * then counts as an empty string).
 *
 * When `copy` is non-zero, or the string is empty, the string is "owned":
 * up to 5 bytes are stored inline in the returned value, longer strings are
 * appended to `mjs->owned_strings`. `p` may be NULL for an owned string; the
 * space is then reserved without copying any bytes.
 *
 * When `copy` is zero the string is "foreign": only the pointer and the
 * length are recorded; the bytes at `p` are not copied.
 */
mjs_val_t mjs_mk_string(struct mjs* mjs, const char* p, size_t len, int copy) {
    struct mbuf* m;
    mjs_val_t offset, tag = MJS_TAG_STRING_F;

    /*
     * Resolve the "use strlen()" sentinel before anything else looks at
     * `len`. Doing it later would let an empty foreign C string slip past
     * the zero-length check below and be encoded as a foreign string of
     * length 0, which has a special meaning.
     */
    if(len == ~((size_t)0)) len = (p != NULL) ? strlen(p) : 0;

    if(len == 0) {
        /*
         * Zero length for foreign string has a special meaning (that the
         * foreign string is not inlined into mjs_val_t), so when creating a
         * zero-length string, we always assume it'll be owned. Since the
         * length is zero, it doesn't matter anyway.
         */
        copy = 1;
    }
    m = copy ? &mjs->owned_strings : &mjs->foreign_strings;
    offset = m->len;

    if(copy) {
        /* owned string */
        if(len <= 4) {
            /* Inline form: one length byte followed by up to 4 data bytes. */
            char* s = GET_VAL_NAN_PAYLOAD(offset) + 1;
            offset = 0;
            if(p != 0) {
                memcpy(s, p, len);
            }
            s[-1] = len;
            tag = MJS_TAG_STRING_I;
        } else if(len == 5) {
            /* Inline form for exactly 5 bytes: no length byte is stored. */
            char* s = GET_VAL_NAN_PAYLOAD(offset);
            offset = 0;
            if(p != 0) {
                memcpy(s, p, len);
            }
            tag = MJS_TAG_STRING_5;
            // } else if ((dict_index = v_find_string_in_dictionary(p, len)) >= 0) {
            //   offset = 0;
            //   GET_VAL_NAN_PAYLOAD(offset)[0] = dict_index;
            //   tag = MJS_TAG_STRING_D;
        } else {
            if(gc_strings_is_gc_needed(mjs)) {
                mjs->need_gc = 1;
            }

            /*
             * Before embedding new string, check if the reallocation is
             * needed. If so, perform the reallocation by calling
             * `mbuf_resize` manually, since we need to preallocate some
             * extra space (`MJS_STRING_BUF_RESERVE`).
             */
            if((m->len + len) > m->size) {
                char* prev_buf = m->buf;
                mbuf_resize(m, m->len + len + MJS_STRING_BUF_RESERVE);

                /*
                 * There is a corner case: when the source pointer is located
                 * within the mbuf. In this case, we should adjust the
                 * pointer, because it might have just been reallocated.
                 */
                if(p >= prev_buf && p < (prev_buf + m->len)) {
                    p += (m->buf - prev_buf);
                }
            }

            embed_string(m, m->len, p, len, EMBSTR_ZERO_TERM);
            tag = MJS_TAG_STRING_O;
        }
    } else {
        /* foreign string */
        if(sizeof(void*) <= 4 && len <= (1 << 15)) {
            /* small foreign strings can fit length and ptr in the mjs_val_t */
            offset = (uint64_t)len << 32 | (uint64_t)(uintptr_t)p;
        } else {
            /* bigger strings need indirection that uses ram */
            size_t pos = m->len;
            size_t llen = cs_varint_llen(len);

            /* allocate space for len and ptr */
            mbuf_insert(m, pos, NULL, llen + sizeof(p));

            cs_varint_encode(len, (uint8_t*)(m->buf + pos), llen);
            memcpy(m->buf + pos + llen, &p, sizeof(p));
        }
        tag = MJS_TAG_STRING_F;
    }

    /* NOTE(lsm): don't use pointer_to_value, 32-bit ptrs will truncate */
    return (offset & ~MJS_TAG_MASK) | tag;
}

/*
 * Resolves a string value to a pointer to its bytes and its length.
 *
 * Returns NULL, with a length of 0, when `*v` is not a string or when its
 * buffer offset cannot be decoded. If `sizep` is not NULL the length is
 * stored there. For the inline string forms the returned pointer points
 * into `*v` itself.
 */
const char* mjs_get_string(struct mjs* mjs, mjs_val_t* v, size_t* sizep) {
    const uint64_t tag = v[0] & MJS_TAG_MASK;
    const char* p = NULL;
    size_t size = 0, llen;

    if(!mjs_is_string(*v)) {
        goto clean;
    }

    if(tag == MJS_TAG_STRING_I) {
        /* Inline: first payload byte is the length, data follows it. */
        const char* payload = GET_VAL_NAN_PAYLOAD(*v);
        size = payload[0];
        p = payload + 1;
    } else if(tag == MJS_TAG_STRING_5) {
        /* Inline, exactly 5 bytes, no length byte. */
        size = 5;
        p = GET_VAL_NAN_PAYLOAD(*v);
        // } else if (tag == MJS_TAG_STRING_D) {
        //   int index = ((unsigned char *) GET_VAL_NAN_PAYLOAD(*v))[0];
        //   size = v_dictionary_strings[index].len;
        //   p = v_dictionary_strings[index].p;
    } else if(tag == MJS_TAG_STRING_O) {
        /* Owned: varint length followed by the bytes, in owned_strings. */
        const size_t offset = (size_t)gc_string_mjs_val_to_offset(*v);
        char* entry = mjs->owned_strings.buf + offset;
        uint64_t decoded_len = 0;

        if(offset >= mjs->owned_strings.len ||
           !cs_varint_decode(
               (uint8_t*)entry, mjs->owned_strings.len - offset, &decoded_len, &llen)) {
            goto clean;
        }
        size = decoded_len;
        p = entry + llen;
    } else if(tag == MJS_TAG_STRING_F) {
        /*
         * Short foreign strings on <=32-bit machines can be encoded in a
         * compact form, with the length in bits 32..47 and the pointer in
         * the low 32 bits:
         *
         *     7         6        5        4        3        2        1        0
         *  11111111|1111tttt|llllllll|llllllll|ssssssss|ssssssss|ssssssss|ssssssss
         *
         * Foreign strings that are not stored this way are indirected
         * through the foreign_strings mbuf.
         *
         * We don't use a different tag to represent those two cases.
         * Instead, a zero in the 16-bit length field marks a string that is
         * represented with the help of the foreign_strings mbuf.
         */
        const uint16_t inline_len = (*v >> 32) & 0xFFFF;

        if(sizeof(void*) <= 4 && inline_len != 0) {
            size = (size_t)inline_len;
            p = (const char*)(uintptr_t)*v;
        } else {
            /* Indirect: varint length followed by the raw pointer bytes. */
            const size_t offset = (size_t)gc_string_mjs_val_to_offset(*v);
            char* entry = mjs->foreign_strings.buf + offset;
            uint64_t decoded_len = 0;

            if(offset >= mjs->foreign_strings.len ||
               !cs_varint_decode(
                   (uint8_t*)entry, mjs->foreign_strings.len - offset, &decoded_len, &llen)) {
                goto clean;
            }
            size = decoded_len;
            memcpy(&p, entry + llen, sizeof(p));
        }
    } else {
        assert(0);
    }

clean:
    if(sizep != NULL) {
        *sizep = size;
    }
    return p;
}

/*
 * Like mjs_get_string(), but only hands out the pointer when the string is
 * usable as a C string: the byte right after it must be NUL and it must not
 * contain an embedded NUL. Returns NULL otherwise, or when `*value` does not
 * resolve to a string.
 */
const char* mjs_get_cstring(struct mjs* mjs, mjs_val_t* value) {
    size_t len;
    const char* str = mjs_get_string(mjs, value, &len);

    if(str != NULL && str[len] == 0 && strlen(str) == len) {
        return str;
    }
    return NULL;
}

/*
 * Compares the string value `*a` with the `len` bytes at `b`.
 *
 * `len` may be (size_t)~0, in which case `strlen(b)` is used.
 *
 * Returns 0 when both have the same length and strncmp() finds no
 * difference, a negative value when `*a` sorts first and a positive value
 * when `b` sorts first. Strings of different length are ordered by length.
 * If `*a` does not resolve to a string the result is non-zero.
 */
int mjs_strcmp(struct mjs* mjs, mjs_val_t* a, const char* b, size_t len) {
    size_t n = 0;
    const char* s;

    if(len == (size_t)~0) len = strlen(b);

    s = mjs_get_string(mjs, a, &n);
    if(s == NULL) {
        /* Not a string: never report it as equal, not even to "". */
        return -1;
    }
    if(n != len) {
        /* Explicit sign instead of a size_t difference truncated to int. */
        return n < len ? -1 : 1;
    }
    return strncmp(s, b, len);
}

/*
 * Parses the `len` bytes at `s` as an unsigned decimal number.
 *
 * `*ok` is set to 1 only when `len` is non-zero, every one of the `len`
 * bytes is a decimal digit and the value fits in `unsigned long`; otherwise
 * it is set to 0. Whitespace and sign characters are not accepted.
 *
 * Only `s[0..len)` is read, so `s` does not have to be NUL-terminated. A
 * NULL `s` yields 0 with `*ok == 0`.
 *
 * Returns the parsed value. When `*ok` is 0 the return value is the number
 * formed by the digits seen before the failure (the maximum `unsigned long`
 * on overflow).
 */
MJS_PRIVATE unsigned long cstr_to_ulong(const char* s, size_t len, int* ok) {
    const unsigned long max = ~0UL;
    unsigned long res = 0;
    size_t i;

    *ok = 0;
    if(s == NULL) {
        return 0;
    }

    for(i = 0; i < len; i++) {
        unsigned long digit;

        if(s[i] < '0' || s[i] > '9') {
            return res;
        }
        digit = (unsigned long)(s[i] - '0');

        /* res * 10 + digit would exceed max */
        if(res > (max - digit) / 10) {
            return max;
        }
        res = res * 10 + digit;
    }

    *ok = (len != 0);
    return res;
}

/*
 * Parses the string value `v` as an unsigned decimal number via
 * cstr_to_ulong(). The number goes to `*res`, the parse status to `*ok`.
 * Always returns MJS_OK.
 */
MJS_PRIVATE mjs_err_t str_to_ulong(struct mjs* mjs, mjs_val_t v, int* ok, unsigned long* res) {
    size_t n_bytes = 0;
    const char* text = mjs_get_string(mjs, &v, &n_bytes);

    *res = cstr_to_ulong(text, n_bytes, ok);

    /* There is no failure path here; parse errors are reported via `*ok`. */
    return MJS_OK;
}

/*
 * Three-way comparison of two string values by byte content.
 *
 * The order is lexicographic: the first differing byte decides, and when
 * one string is a prefix of the other the shorter one sorts first.
 *
 * Returns a negative value if `a` sorts before `b`, 0 if they are equal and
 * a positive value if `a` sorts after `b`.
 */
MJS_PRIVATE int s_cmp(struct mjs* mjs, mjs_val_t a, mjs_val_t b) {
    size_t a_len, b_len, common_len;
    const char *a_ptr, *b_ptr;
    int cmp = 0;

    a_ptr = mjs_get_string(mjs, &a, &a_len);
    b_ptr = mjs_get_string(mjs, &b, &b_len);

    /* Compare the part both strings have; skip memcmp() for empty input. */
    common_len = a_len < b_len ? a_len : b_len;
    if(common_len != 0) {
        cmp = memcmp(a_ptr, b_ptr, common_len);
    }
    if(cmp != 0) {
        return cmp;
    }

    /* One is a prefix of the other (or they are equal): length decides. */
    if(a_len == b_len) {
        return 0;
    }
    return a_len > b_len ? 1 : -1;
}

/*
 * Returns a new owned string holding the bytes of `a` followed by the bytes
 * of `b`.
 */
MJS_PRIVATE mjs_val_t s_concat(struct mjs* mjs, mjs_val_t a, mjs_val_t b) {
    size_t len_a, len_b, len_out;
    const char *src_a, *src_b;
    char* dst;
    mjs_val_t out;

    /* First pass: only the lengths are needed, to size the result */
    (void)mjs_get_string(mjs, &a, &len_a);
    (void)mjs_get_string(mjs, &b, &len_b);

    /* Reserve the result; a NULL source means "do not copy anything yet" */
    out = mjs_mk_string(mjs, NULL, len_a + len_b, 1);

    /* mjs_mk_string() may have reallocated the mbuf, so fetch the sources again */
    src_a = mjs_get_string(mjs, &a, &len_a);
    src_b = mjs_get_string(mjs, &b, &len_b);

    /* Fill the reserved space: `a` first, `b` right behind it */
    dst = (char*)mjs_get_string(mjs, &out, &len_out);
    memcpy(dst, src_a, len_a);
    memcpy(dst + len_a, src_b, len_b);

    return out;
}

/*
 * Script-callable `slice(beginSlice[, endSlice])` on a string `this`.
 *
 * Both indices go through mjs_normalize_idx(). Without `endSlice` the slice
 * runs to the end of the string; an end before the begin gives an empty
 * string. If an argument check fails, the number 0 is returned instead.
 */
MJS_PRIVATE void mjs_string_slice(struct mjs* mjs) {
    int nargs = mjs_nargs(mjs);
    mjs_val_t result = mjs_mk_number(mjs, 0);
    mjs_val_t begin_v = MJS_UNDEFINED;
    mjs_val_t end_v = MJS_UNDEFINED;
    int begin = 0;
    int end = 0;
    size_t str_len;
    const char* str = NULL;

    /* `this` must be a string */
    if(!mjs_check_arg(mjs, -1 /*this*/, "this", MJS_TYPE_STRING, NULL)) {
        goto done;
    }
    str = mjs_get_string(mjs, &mjs->vals.this_obj, &str_len);

    /* arg 0: begin index */
    if(!mjs_check_arg(mjs, 0, "beginSlice", MJS_TYPE_NUMBER, &begin_v)) {
        goto done;
    }
    begin = mjs_normalize_idx(mjs_get_int(mjs, begin_v), str_len);

    if(nargs < 2) {
        /* no end index given: slice up to the end of the string */
        end = str_len;
    } else {
        /* arg 1: end index */
        if(!mjs_check_arg(mjs, 1, "endSlice", MJS_TYPE_NUMBER, &end_v)) {
            goto done;
        }
        end = mjs_normalize_idx(mjs_get_int(mjs, end_v), str_len);
    }

    /* never produce a negative length */
    if(end < begin) {
        end = begin;
    }

    result = mjs_mk_string(mjs, str + begin, end - begin, 1);

done:
    mjs_return(mjs, result);
}

MJS_PRIVATE void mjs_string_index_of(struct mjs* mjs) {
    mjs_val_t ret = mjs_mk_number(mjs, -1);
    mjs_val_t substr_v = MJS_UNDEFINED;
    mjs_val_t idx_v = MJS_UNDEFINED;
    int idx = 0;
    const char *str = NULL, *substr = NULL;
    size_t str_len = 0, substr_len = 0;

    if(!mjs_check_arg(mjs, -1 /* this */, "this", MJS_TYPE_STRING, NULL)) {
        goto clean;
    }
    str = mjs_get_string(mjs, &mjs->vals.this_obj, &str_len);

    if(!mjs_check_arg(mjs, 0, "searchValue", MJS_TYPE_STRING, &substr_v)) {
        goto clean;
    }
    substr = mjs_get_string(mjs, &substr_v, &substr_len);
    if(mjs_nargs(mjs) > 1) {
        if(!mjs_check_arg(mjs, 1, "fromIndex", MJS_TYPE_NUMBER, &idx_v)) {
            goto clean;
        }
        idx = mjs_get_int(mjs, idx_v);
        if(idx < 0) idx = 0;
        if((size_t)idx > str_len) idx = str_len;
    }
    {
        const char* substr_p;
        struct mg_str mgstr, mgsubstr;
        mgstr.p = str + idx;
        mgstr.len = str_len - idx;
        mgsubstr.p = substr;
        mgsubstr.len = substr_len;
        substr_p = mg_strstr(mgstr, mgsubstr);
        if(substr_p != NULL) {
            ret = mjs_mk_number(mjs, (int)(substr_p - str));
        }
    }

clean:
    mjs_return(mjs, ret);
}

/*
 * Script-callable `charCodeAt(index)` on a string `this`.
 *
 * The index goes through mjs_normalize_idx(). Returns the byte at that
 * position as a number, or `undefined` when the position is outside the
 * string or an argument check fails.
 */
MJS_PRIVATE void mjs_string_char_code_at(struct mjs* mjs) {
    mjs_val_t result = MJS_UNDEFINED;
    mjs_val_t index_v = MJS_UNDEFINED;
    int pos = 0;
    size_t str_len;
    const unsigned char* bytes = NULL;

    /* `this` must be a string */
    if(!mjs_check_arg(mjs, -1 /*this*/, "this", MJS_TYPE_STRING, NULL)) {
        goto done;
    }
    bytes = (const unsigned char*)mjs_get_string(mjs, &mjs->vals.this_obj, &str_len);

    /* arg 0: position of the byte to read */
    if(!mjs_check_arg(mjs, 0, "index", MJS_TYPE_NUMBER, &index_v)) {
        goto done;
    }
    pos = mjs_normalize_idx(mjs_get_int(mjs, index_v), str_len);

    if(pos < 0 || pos >= (int)str_len) {
        goto done;
    }
    result = mjs_mk_number(mjs, bytes[pos]);

done:
    mjs_return(mjs, result);
}

/*
 * Script-callable `mkstr`: builds a string from a foreign pointer.
 *
 * Accepted call forms:
 *   mkstr(ptr, len)
 *   mkstr(ptr, offset, len)
 *   mkstr(ptr, offset, len, copy)
 *
 * The string covers `len` bytes starting at `ptr + offset`. With a truthy
 * `copy` the bytes are copied, otherwise the string refers to the memory at
 * `ptr`. On any argument error a type error is recorded and `undefined` is
 * returned.
 */
MJS_PRIVATE void mjs_mkstr(struct mjs* mjs) {
    int nargs = mjs_nargs(mjs);
    mjs_val_t ret = MJS_UNDEFINED;

    char* ptr = NULL;
    int offset = 0;
    int len = 0;
    int copy = 0;

    mjs_val_t ptr_v = MJS_UNDEFINED;
    mjs_val_t offset_v = MJS_UNDEFINED;
    mjs_val_t len_v = MJS_UNDEFINED;
    mjs_val_t copy_v = MJS_UNDEFINED;

    if(nargs == 2) {
        ptr_v = mjs_arg(mjs, 0);
        len_v = mjs_arg(mjs, 1);
    } else if(nargs == 3) {
        ptr_v = mjs_arg(mjs, 0);
        offset_v = mjs_arg(mjs, 1);
        len_v = mjs_arg(mjs, 2);
    } else if(nargs == 4) {
        ptr_v = mjs_arg(mjs, 0);
        offset_v = mjs_arg(mjs, 1);
        len_v = mjs_arg(mjs, 2);
        copy_v = mjs_arg(mjs, 3);
    } else {
        mjs_prepend_errorf(
            mjs,
            MJS_TYPE_ERROR,
            "mkstr takes 2, 3 or 4 arguments: (ptr, len), (ptr, "
            "offset, len) or (ptr, offset, len, copy)");
        goto clean;
    }

    if(!mjs_is_foreign(ptr_v)) {
        mjs_prepend_errorf(mjs, MJS_TYPE_ERROR, "ptr should be a foreign pointer");
        goto clean;
    }

    if(offset_v != MJS_UNDEFINED && !mjs_is_number(offset_v)) {
        mjs_prepend_errorf(mjs, MJS_TYPE_ERROR, "offset should be a number");
        goto clean;
    }

    if(!mjs_is_number(len_v)) {
        mjs_prepend_errorf(mjs, MJS_TYPE_ERROR, "len should be a number");
        goto clean;
    }

    copy = mjs_is_truthy(mjs, copy_v);

    ptr = (char*)mjs_get_ptr(mjs, ptr_v);
    if(offset_v != MJS_UNDEFINED) {
        offset = mjs_get_int(mjs, offset_v);
    }
    len = mjs_get_int(mjs, len_v);

    /*
     * mjs_mk_string() takes the length as size_t: a negative value would
     * turn into a huge length (or, for -1, into its "use strlen()" marker)
     * and make it read far beyond the memory the caller meant.
     */
    if(len < 0) {
        mjs_prepend_errorf(mjs, MJS_TYPE_ERROR, "len should not be negative");
        goto clean;
    }

    /* all arguments are fine */

    ret = mjs_mk_string(mjs, ptr + offset, len, copy);

clean:
    mjs_return(mjs, ret);
}

/*
 * Error codes of the escape decoder. nextesc() returns them negated.
 *
 * Note that SLRE_INVALID_HEX_DIGIT is 0, so its negated form is also 0 and
 * cannot be told apart from a decoded NUL character.
 */
enum unescape_error {
    SLRE_INVALID_HEX_DIGIT,
    SLRE_INVALID_ESC_CHAR,
    SLRE_UNTERM_ESC_SEQ,
};

/*
 * Returns the value (0..15) of the hexadecimal digit `c`, or
 * -SLRE_INVALID_HEX_DIGIT if `c` is not a hexadecimal digit.
 */
static int hex(int c) {
    if(c >= '0' && c <= '9') return c - '0';
    if(c >= 'a' && c <= 'f') return c - 'a' + 10;
    if(c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -SLRE_INVALID_HEX_DIGIT;
}

/*
 * Decodes one escape sequence. On entry `*p` points at the character right
 * after the backslash; the text must be NUL-terminated.
 *
 * Returns the decoded character code, or a negated `enum unescape_error`.
 * `*p` is always moved past the escape character itself, and additionally
 * past the extra characters of a successfully decoded `\cX`, `\xHH` or
 * `\uHHHH` sequence.
 */
static int nextesc(const char** p) {
    const unsigned char* s = (const unsigned char*)(*p)++;
    switch(*s) {
    case 0:
        return -SLRE_UNTERM_ESC_SEQ;
    case 'c':
        /*
         * \cX is the control character made of the low 5 bits of X, so the
         * value has to come from the character after the 'c'. Do not step
         * over the terminating NUL when X is missing.
         */
        if(s[1] == 0) {
            return -SLRE_UNTERM_ESC_SEQ;
        }
        ++*p;
        return s[1] & 31;
    case 'b':
        return '\b';
    case 't':
        return '\t';
    case 'n':
        return '\n';
    case 'v':
        return '\v';
    case 'f':
        return '\f';
    case 'r':
        return '\r';
    case '\\':
        return '\\';
    case 'u':
        /* \uHHHH: exactly four hexadecimal digits */
        if(isxdigit(s[1]) && isxdigit(s[2]) && isxdigit(s[3]) && isxdigit(s[4])) {
            (*p) += 4;
            return hex(s[1]) << 12 | hex(s[2]) << 8 | hex(s[3]) << 4 | hex(s[4]);
        }
        return -SLRE_INVALID_HEX_DIGIT;
    case 'x':
        /* \xHH: exactly two hexadecimal digits */
        if(isxdigit(s[1]) && isxdigit(s[2])) {
            (*p) += 2;
            return (hex(s[1]) << 4) | hex(s[2]);
        }
        return -SLRE_INVALID_HEX_DIGIT;
    default:
        return -SLRE_INVALID_ESC_CHAR;
    }
}

/*
 * Decodes the backslash escapes in `s[0..len)`.
 *
 * The decoded bytes are written to `to`; when `to` is NULL nothing is
 * stored and the function only counts. Returns the number of decoded bytes.
 *
 * `\"`, `\'` and backslash-newline yield the character after the backslash.
 * Everything else goes through nextesc(); if that reports an unknown escape
 * character, the backslash and the character are both kept as they are.
 */
MJS_PRIVATE size_t unescape(const char* s, size_t len, char* to) {
    const char* const end = s + len;
    size_t out_len = 0;
    char scratch[4];
    Rune r;

    while(s < end) {
        s += chartorune(&r, s);

        if(r == '\\' && s < end) {
            const char next = *s;

            if(next == '"' || next == '\'' || next == '\n') {
                /* the escaped character stands for itself */
                r = (Rune)next;
                s++;
            } else {
                const char* esc_start = s;
                int code = nextesc(&s);

                if(code == -SLRE_INVALID_ESC_CHAR) {
                    /* emit the backslash, then re-read the character after it */
                    r = '\\';
                    s = esc_start;
                    out_len += runetochar(to == NULL ? scratch : to + out_len, &r);
                    s += chartorune(&r, s);
                } else {
                    /* decoded value, or one of the other (negated) error codes */
                    r = code;
                }
            }
        }

        out_len += runetochar(to == NULL ? scratch : to + out_len, &r);
    }

    return out_len;
}

/*
 * Inserts a string into `m` at `offset`, laid out as a varint length prefix
 * followed by the string bytes and, with EMBSTR_ZERO_TERM, a trailing NUL.
 *
 * `p`/`len` describe the input. With EMBSTR_UNESCAPE the input is run
 * through unescape() and the stored length is that of the decoded text.
 * If `p` is NULL the string bytes are reserved but left unwritten. `p` may
 * point into `m` itself; it is re-based if the buffer moves.
 *
 * If the buffer cannot be grown, nothing is written.
 */
MJS_PRIVATE void embed_string(
    struct mbuf* m,
    size_t offset,
    const char* p,
    size_t len,
    uint8_t /*enum embstr_flags*/ flags) {
    char* old_base = m->buf;
    uint8_t p_backed_by_mbuf = p >= old_base && p < old_base + m->len;
    size_t n = (flags & EMBSTR_UNESCAPE) ? unescape(p, len, NULL) : len;

    /* Calculate how many bytes length takes */
    size_t k = cs_varint_llen(n);

    /* total length: varint length + string len + zero-term */
    size_t tot_len = k + n + !!(flags & EMBSTR_ZERO_TERM);

    /*
     * Allocate buffer. mbuf_insert() reports the number of bytes inserted;
     * a result of 0 is taken as an allocation failure, in which case
     * writing below would go past the end of the buffer.
     * NOTE(review): relies on tot_len never being 0, i.e. on the varint
     * length prefix taking at least one byte - confirm cs_varint_llen().
     */
    if(mbuf_insert(m, offset, NULL, tot_len) == 0) {
        return;
    }

    /* Fixup p if it was relocated by mbuf_insert() above */
    if(p_backed_by_mbuf) {
        p += m->buf - old_base;
    }

    /* Write length */
    cs_varint_encode(n, (unsigned char*)m->buf + offset, k);

    /* Write string */
    if(p != 0) {
        if(flags & EMBSTR_UNESCAPE) {
            unescape(p, len, m->buf + offset + k);
        } else {
            memcpy(m->buf + offset + k, p, len);
        }
    }

    /* add NULL-terminator if needed */
    if(flags & EMBSTR_ZERO_TERM) {
        m->buf[offset + tot_len - 1] = '\0';
    }
}