view markdown_converter/wasm/markdown_to_html_wasm.c @ 168:f3084bca7317

[Misc] Fixed all errors and all tests should pass now.
author MrJuneJune <me@mrjunejune.com>
date Mon, 19 Jan 2026 16:29:02 -0800
parents cd35e600ae34
children
line wrap: on
line source

/**
 * Markdown to HTML Converter - Standalone WASM Implementation
 * No libc dependencies - can be compiled with: clang --target=wasm32
 */

// Export marker: gives the annotated symbol default visibility.
// NOTE(review): presumably this is what keeps the entry points reachable from
// the host (JS) side -- confirm against the linker flags used for the build.
#define WASM_EXPORT __attribute__((visibility("default")))

// Local stand-ins for the usual <stddef.h>/<stdint.h> names; this file pulls
// in no libc headers, so the types it needs are declared here.
typedef unsigned long size_t;
typedef int int32_t;

// Simple bump allocator for WASM.
//
// Allocations are carved sequentially out of one static arena. Individual
// blocks are never handed back; the only way to reclaim space is heap_reset(),
// after which previously returned pointers refer to memory that will be
// reused by later allocations.
#define HEAP_SIZE (1024 * 1024)  // 1MB heap
#define HEAP_ALIGN 8             // alignment of every returned pointer

// The arena itself is aligned so that an aligned offset is an aligned address.
static char heap[HEAP_SIZE] __attribute__((aligned(HEAP_ALIGN)));
static size_t heap_offset = 0;   // index of the first unused byte in `heap`

/**
 * Allocate `size` bytes from the arena.
 *
 * Returns a pointer aligned to HEAP_ALIGN bytes, or 0 when the request does
 * not fit in the space that is left.
 */
WASM_EXPORT void *malloc(size_t size)
{
  // Round the cursor up to the next HEAP_ALIGN boundary.
  size_t aligned_offset =
      (heap_offset + (HEAP_ALIGN - 1)) & ~(size_t)(HEAP_ALIGN - 1);

  // Compare against the space that remains rather than computing
  // aligned_offset + size: that sum wraps around for very large requests and
  // would let them slip past the bounds check.
  if (aligned_offset > (size_t)HEAP_SIZE) return 0;
  if (size > (size_t)HEAP_SIZE - aligned_offset) return 0;

  void *ptr = &heap[aligned_offset];
  heap_offset = aligned_offset + size;
  return ptr;
}

/**
 * Release a block. Intentionally a no-op: a bump allocator cannot return
 * individual blocks to the arena.
 */
WASM_EXPORT void free(void *ptr)
{
  (void)ptr;
}

/**
 * Rewind the arena to empty. Subsequent malloc() calls start again from the
 * beginning of the arena, so nothing allocated earlier may be used afterwards.
 */
WASM_EXPORT void heap_reset(void)
{
  heap_offset = 0;
}

// String functions
static size_t strlen(const char *s)
{
  size_t len = 0;
  while (s[len]) len++;
  return len;
}

// Copy `n` bytes from `src` to `dest` in ascending address order.
// Returns `dest`.
static void *memcpy(void *dest, const void *src, size_t n)
{
  char *out = (char *)dest;
  const char *in = (const char *)src;

  for (size_t i = 0; i < n; i++) {
    out[i] = in[i];
  }
  return dest;
}

// Returns 1 for space, tab, newline, carriage return, form feed and vertical
// tab; 0 for every other value.
static int isspace_c(int c)
{
  switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
    case '\v':
      return 1;
    default:
      return 0;
  }
}

// Returns 1 when `c` is one of the ASCII digits '0'..'9', otherwise 0.
static int isdigit_c(int c)
{
  if (c < '0') return 0;
  return c <= '9';
}

// String buffer for building HTML output.
//
// Invariant maintained by the functions below: capacity > length, and
// data[length] is always '\0', so `data` can be handed out as a C string at
// any time.
typedef struct {
  char  *data;      // text bytes followed by a '\0' terminator
  size_t length;    // number of text bytes, terminator excluded
  size_t capacity;  // total bytes available in `data`
} StringBuffer;

/**
 * Create an empty buffer with room for `initial_capacity` bytes.
 *
 * A request for 0 bytes is raised to 1 so the terminator always fits.
 * Returns 0 when either allocation fails.
 */
static StringBuffer *buffer_create(size_t initial_capacity)
{
  if (initial_capacity == 0) initial_capacity = 1;

  StringBuffer *buf = (StringBuffer *)malloc(sizeof(StringBuffer));
  if (!buf) return 0;

  buf->data = (char *)malloc(initial_capacity);
  if (!buf->data) {
    free(buf);
    return 0;
  }

  buf->data[0] = '\0';
  buf->length = 0;
  buf->capacity = initial_capacity;
  return buf;
}

/**
 * Make sure `needed` more bytes plus the terminator fit in the buffer.
 *
 * Capacity grows by doubling. Returns 1 when the space is available and 0
 * when it is not (size overflow or allocation failure); on failure the buffer
 * is left exactly as it was, so callers must not write the extra bytes.
 */
static int buffer_grow(StringBuffer *buf, size_t needed)
{
  const size_t size_max = (size_t)-1;

  // Fast path: the bytes and the terminator already fit.
  if (buf->capacity > buf->length && needed < buf->capacity - buf->length) {
    return 1;
  }

  // length + needed + 1 must be representable.
  if (needed >= size_max - buf->length) return 0;
  size_t required = buf->length + needed + 1;

  // Start from 1 when capacity is 0; doubling 0 would never terminate.
  size_t new_capacity = buf->capacity ? buf->capacity : 1;
  while (new_capacity < required) {
    if (new_capacity > size_max / 2) {
      new_capacity = required;  // doubling would wrap; settle for exact fit
      break;
    }
    new_capacity *= 2;
  }

  char *new_data = (char *)malloc(new_capacity);
  if (!new_data && new_capacity > required) {
    // The doubled size did not fit in the heap; an exact fit still might.
    new_capacity = required;
    new_data = (char *)malloc(new_capacity);
  }
  if (!new_data) return 0;

  if (buf->length > 0) memcpy(new_data, buf->data, buf->length);
  new_data[buf->length] = '\0';

  free(buf->data);
  buf->data = new_data;
  buf->capacity = new_capacity;
  return 1;
}

/**
 * Append the NUL-terminated string `str`.
 * If the buffer cannot grow, the append is dropped and the buffer is unchanged.
 */
static void buffer_append(StringBuffer *buf, const char *str)
{
  size_t len = strlen(str);
  if (!buffer_grow(buf, len)) return;

  memcpy(buf->data + buf->length, str, len + 1);  // +1 copies the terminator
  buf->length += len;
}

/**
 * Append exactly `n` bytes starting at `str` and re-terminate the buffer.
 * If the buffer cannot grow, the append is dropped and the buffer is unchanged.
 */
static void buffer_append_n(StringBuffer *buf, const char *str, size_t n)
{
  if (!buffer_grow(buf, n)) return;

  memcpy(buf->data + buf->length, str, n);
  buf->length += n;
  buf->data[buf->length] = '\0';
}

/**
 * Append the single byte `c` and re-terminate the buffer.
 * If the buffer cannot grow, the append is dropped and the buffer is unchanged.
 */
static void buffer_append_char(StringBuffer *buf, char c)
{
  if (!buffer_grow(buf, 1)) return;

  buf->data[buf->length++] = c;
  buf->data[buf->length] = '\0';
}

// Returns 1 when `line`, ignoring leading whitespace, begins with `pattern`;
// 0 otherwise. An empty pattern always matches.
static int starts_with(const char *line, const char *pattern)
{
  const char *cursor = line;
  while (*cursor != '\0' && isspace_c(*cursor)) {
    cursor++;
  }

  // Walk both strings in step. Running out of `line` early shows up as a
  // mismatch between its '\0' and the pattern byte still expected.
  for (const char *want = pattern; *want != '\0'; want++, cursor++) {
    if (*cursor != *want) return 0;
  }
  return 1;
}

// Heading level of `line`: the number of leading '#' characters (after any
// leading whitespace), counted up to 6, provided the character right after
// them is a space. Returns 0 when the line is not a heading.
static int count_heading_level(const char *line)
{
  const char *p = line;
  while (*p != '\0' && isspace_c(*p)) {
    p++;
  }

  int level = 0;
  for (; level < 6; level++) {
    if (p[level] != '#') break;
  }

  if (level == 0) return 0;
  return p[level] == ' ' ? level : 0;
}

// Returns a pointer to the first non-whitespace character of `str`, or to its
// terminating '\0' when the string is all whitespace.
static const char *skip_whitespace(const char *str)
{
  for (;;) {
    if (*str == '\0' || !isspace_c(*str)) return str;
    str++;
  }
}

// Returns 1 when `line` contains nothing but whitespace (or nothing at all),
// 0 as soon as any other character is found.
static int is_empty_line(const char *line)
{
  for (const char *p = line; *p != '\0'; p++) {
    if (!isspace_c(*p)) return 0;
  }
  return 1;
}

// Returns 1 when `line` is a horizontal rule: its first non-whitespace
// character is '-', '*' or '_', the rest of the line consists only of that
// same character and whitespace, and the character occurs at least 3 times.
static int is_horizontal_rule(const char *line)
{
  const char *p = skip_whitespace(line);
  const char marker = *p;

  switch (marker) {
    case '-':
    case '*':
    case '_':
      break;
    default:
      return 0;
  }

  int markers = 0;
  for (; *p != '\0'; p++) {
    if (*p == marker) {
      markers++;
      continue;
    }
    if (!isspace_c(*p)) return 0;
  }
  return markers >= 3;
}

// Returns 1 when `line`, after leading whitespace, starts with a bullet
// ('-', '*' or '+') immediately followed by a space; 0 otherwise.
static int is_unordered_list(const char *line)
{
  const char *p = skip_whitespace(line);

  switch (*p) {
    case '-':
    case '*':
    case '+':
      return p[1] == ' ';
    default:
      return 0;
  }
}

// Returns 1 when `line`, after leading whitespace, starts with one or more
// digits followed by ". " (for example "1. item" or "12. item"); 0 otherwise.
static int is_ordered_list(const char *line)
{
  line = skip_whitespace(line);

  const char *digits = line;
  while (isdigit_c(*line)) line++;

  // At least one digit is required; otherwise ordinary text that happens to
  // begin with ". " would be turned into a list item.
  if (line == digits) return 0;

  return *line == '.' && line[1] == ' ';
}

// Returns 1 when the first non-whitespace character of `line` is '|'.
static int is_table_row(const char *line)
{
  const char *first = skip_whitespace(line);
  return first[0] == '|';
}

// Returns 1 when `line` is a table separator row such as "| --- | :-: |":
// it starts (after whitespace) with '|', everything after that is made up of
// '-', '|', ':' and whitespace only, and at least one '-' is present.
static int is_table_separator(const char *line)
{
  const char *p = skip_whitespace(line);
  if (*p != '|') return 0;

  int saw_dash = 0;
  for (p++; *p != '\0'; p++) {
    switch (*p) {
      case '-':
        saw_dash = 1;
        break;
      case '|':
      case ':':
        break;
      default:
        if (!isspace_c(*p)) return 0;
        break;
    }
  }
  return saw_dash;
}

// Declared ahead of its definition because table cells are rendered with it.
static void process_inline(StringBuffer *buf, const char *text, size_t len);

// Render one table row as <tr>...</tr>.
//
// `line` is split on '|'; each cell is trimmed of surrounding whitespace and
// rendered through process_inline inside <th> (is_header != 0) or <td>.
// A leading '|' is skipped. An empty cell is emitted only when it is followed
// by another '|', so the empty tail after a trailing '|' produces no cell.
static void parse_table_row(StringBuffer *buf, const char *line, int is_header)
{
  const char *open_tag = is_header ? "<th>" : "<td>";
  const char *close_tag = is_header ? "</th>" : "</td>";

  buffer_append(buf, "<tr>");

  const char *p = skip_whitespace(line);
  if (*p == '|') p++;

  while (*p != '\0') {
    // Cell text runs from the first non-blank character up to the next '|'
    // (or the end of the line), minus trailing blanks.
    p = skip_whitespace(p);
    const char *begin = p;
    while (*p != '\0' && *p != '|') p++;

    const char *end = p;
    while (end > begin && isspace_c(end[-1])) end--;

    size_t text_len = (size_t)(end - begin);
    int at_separator = (*p == '|');

    if (text_len > 0 || at_separator) {
      buffer_append(buf, open_tag);
      if (text_len > 0) process_inline(buf, begin, text_len);
      buffer_append(buf, close_tag);
    }

    if (at_separator) p++;

    // Only blanks left: the '|' just consumed was the trailing one.
    if (*skip_whitespace(p) == '\0') break;
  }

  buffer_append(buf, "</tr>");
}

// Append text[0..len) to `buf` with '<', '>' and '&' replaced by their HTML
// entities. When `in_attribute` is non-zero, '"' is escaped as well so the
// text can be placed inside a double-quoted attribute value.
static void append_escaped(StringBuffer *buf, const char *text, size_t len, int in_attribute)
{
  for (size_t i = 0; i < len; i++) {
    char c = text[i];
    if (c == '<') {
      buffer_append(buf, "&lt;");
    } else if (c == '>') {
      buffer_append(buf, "&gt;");
    } else if (c == '&') {
      buffer_append(buf, "&amp;");
    } else if (c == '"' && in_attribute) {
      buffer_append(buf, "&quot;");
    } else {
      buffer_append_char(buf, c);
    }
  }
}

// Process inline markdown.
//
// Renders text[0..len) into `buf`. At each position the constructs are tried
// in this order: link, image, bold, strikethrough, italic, inline code. A
// construct is only recognised when its closing delimiter is found inside the
// span; otherwise the character is emitted as plain, HTML-escaped text.
// Bold, strikethrough and italic content is rendered recursively. Link text,
// image alt text, URLs and inline code are emitted literally but HTML-escaped,
// so markdown input cannot inject tags or break out of an attribute.
//
// NOTE: URL schemes are not validated. A "javascript:" URL is emitted as
// written; callers rendering untrusted markdown must filter hrefs themselves.
static void process_inline(StringBuffer *buf, const char *text, size_t len)
{
  size_t i = 0;

  while (i < len) {
    // Links: [text](url)
    if (text[i] == '[') {
      size_t link_start = i + 1;
      size_t link_end = link_start;
      while (link_end < len && text[link_end] != ']') link_end++;

      if (link_end < len && link_end + 1 < len && text[link_end + 1] == '(') {
        size_t url_start = link_end + 2;
        size_t url_end = url_start;
        while (url_end < len && text[url_end] != ')') url_end++;

        if (url_end < len) {
          buffer_append(buf, "<a href=\"");
          append_escaped(buf, text + url_start, url_end - url_start, 1);
          buffer_append(buf, "\">");
          append_escaped(buf, text + link_start, link_end - link_start, 0);
          buffer_append(buf, "</a>");
          i = url_end + 1;
          continue;
        }
      }
    }

    // Images: ![alt](url)
    if (text[i] == '!' && i + 1 < len && text[i + 1] == '[') {
      size_t alt_start = i + 2;
      size_t alt_end = alt_start;
      while (alt_end < len && text[alt_end] != ']') alt_end++;

      if (alt_end < len && alt_end + 1 < len && text[alt_end + 1] == '(') {
        size_t url_start = alt_end + 2;
        size_t url_end = url_start;
        while (url_end < len && text[url_end] != ')') url_end++;

        if (url_end < len) {
          buffer_append(buf, "<img src=\"");
          append_escaped(buf, text + url_start, url_end - url_start, 1);
          buffer_append(buf, "\" alt=\"");
          append_escaped(buf, text + alt_start, alt_end - alt_start, 1);
          buffer_append(buf, "\">");
          i = url_end + 1;
          continue;
        }
      }
    }

    // Bold: **text** or __text__
    if ((text[i] == '*' && i + 1 < len && text[i + 1] == '*') ||
        (text[i] == '_' && i + 1 < len && text[i + 1] == '_')) {
      char marker = text[i];
      size_t start = i + 2;
      size_t end = start;
      while (end + 1 < len && !(text[end] == marker && text[end + 1] == marker)) end++;

      if (end + 1 < len) {
        buffer_append(buf, "<strong>");
        process_inline(buf, text + start, end - start);
        buffer_append(buf, "</strong>");
        i = end + 2;
        continue;
      }
    }

    // Strikethrough: ~~text~~
    if (text[i] == '~' && i + 1 < len && text[i + 1] == '~') {
      size_t start = i + 2;
      size_t end = start;
      while (end + 1 < len && !(text[end] == '~' && text[end + 1] == '~')) end++;

      if (end + 1 < len) {
        buffer_append(buf, "<del>");
        process_inline(buf, text + start, end - start);
        buffer_append(buf, "</del>");
        i = end + 2;
        continue;
      }
    }

    // Italic: *text* or _text_ (the opener must not be followed by whitespace,
    // and the span must not be empty)
    if ((text[i] == '*' || text[i] == '_') && i + 1 < len && !isspace_c(text[i + 1])) {
      char marker = text[i];
      size_t start = i + 1;
      size_t end = start;
      while (end < len && text[end] != marker) end++;

      if (end < len && end > start) {
        buffer_append(buf, "<em>");
        process_inline(buf, text + start, end - start);
        buffer_append(buf, "</em>");
        i = end + 1;
        continue;
      }
    }

    // Inline code: `code` -- content is literal, so escape it but do not
    // interpret any markdown inside.
    if (text[i] == '`') {
      size_t start = i + 1;
      size_t end = start;
      while (end < len && text[end] != '`') end++;

      if (end < len) {
        buffer_append(buf, "<code>");
        append_escaped(buf, text + start, end - start, 0);
        buffer_append(buf, "</code>");
        i = end + 1;
        continue;
      }
    }

    // Plain character: emit it HTML-escaped.
    append_escaped(buf, text + i, 1, 0);
    i++;
  }
}

// Append an opening ("<hN>") or, when `closing` is non-zero, a closing
// ("</hN>") heading tag, where N is the digit character '0' + level.
static void append_heading_tag(StringBuffer *buf, int level, int closing)
{
  char tag[5];
  size_t n = 0;

  tag[n++] = '<';
  if (closing) tag[n++] = '/';
  tag[n++] = 'h';
  tag[n++] = (char)('0' + level);
  tag[n++] = '>';

  buffer_append_n(buf, tag, n);
}

// Copy the input line beginning at `start` into a newly allocated,
// NUL-terminated string.
//
// On return *end points at the '\n' that ends the line, or at the input's
// final '\0'. A single trailing '\r' is dropped so CRLF input renders the
// same as LF input. When `len` is non-null it receives the copy's length.
// Returns 0 if the allocation fails; *end is still updated in that case.
static char *copy_line(const char *start, const char **end, size_t *len)
{
  const char *eol = start;
  while (*eol && *eol != '\n') eol++;
  *end = eol;

  size_t n = (size_t)(eol - start);
  char *copy = (char *)malloc(n + 1);
  if (!copy) return 0;

  if (n > 0 && start[n - 1] == '\r') n--;

  memcpy(copy, start, n);
  copy[n] = '\0';
  if (len) *len = n;
  return copy;
}

/**
 * Convert markdown to HTML.
 *
 * The input is processed line by line. Each line is classified, in this
 * order, as: blank, heading, fenced code block, blockquote, horizontal rule,
 * unordered list, ordered list, table (a '|' row followed by a separator
 * row), or paragraph. Multi-line constructs consume following lines until one
 * no longer belongs to them.
 *
 * Returns a NUL-terminated HTML string, or 0 when `markdown` is null or the
 * output buffer cannot be created. If copying a top-level line fails for lack
 * of memory, the HTML produced so far is returned. Nothing allocated here is
 * freed by this function.
 */
WASM_EXPORT char *markdown_to_html(const char *markdown)
{
  if (!markdown) return 0;

  StringBuffer *buf = buffer_create(4096);
  if (!buf) return 0;

  // `ptr` always sits either at the start of an unread line or, right after
  // copy_line(), on the '\n' / '\0' that ends the line just read.
  const char *ptr = markdown;

  while (*ptr) {
    const char *line_start = ptr;
    char *line = copy_line(line_start, &ptr, 0);
    if (!line) return buf->data;

    // Skip empty lines
    if (is_empty_line(line)) {
      if (*ptr == '\n') ptr++;
      continue;
    }

    // Headings
    int heading_level = count_heading_level(line);
    if (heading_level > 0) {
      const char *content = skip_whitespace(line);
      while (*content == '#') content++;
      content = skip_whitespace(content);

      append_heading_tag(buf, heading_level, 0);
      process_inline(buf, content, strlen(content));
      append_heading_tag(buf, heading_level, 1);

      if (*ptr == '\n') ptr++;
      continue;
    }

    // Code block: everything up to the closing fence (or end of input) is
    // emitted verbatim, HTML-escaped, one '\n' per line.
    if (starts_with(line, "```")) {
      buffer_append(buf, "<pre><code>");
      if (*ptr == '\n') ptr++;

      while (*ptr) {
        size_t code_len = 0;
        line_start = ptr;
        char *code_line = copy_line(line_start, &ptr, &code_len);
        if (!code_line) break;

        if (starts_with(code_line, "```")) {
          if (*ptr == '\n') ptr++;
          break;
        }

        for (size_t i = 0; i < code_len; i++) {
          if (code_line[i] == '<') buffer_append(buf, "&lt;");
          else if (code_line[i] == '>') buffer_append(buf, "&gt;");
          else if (code_line[i] == '&') buffer_append(buf, "&amp;");
          else buffer_append_char(buf, code_line[i]);
        }
        buffer_append_char(buf, '\n');

        if (*ptr == '\n') ptr++;
      }

      buffer_append(buf, "</code></pre>");
      continue;
    }

    // Blockquote: consecutive '>' lines are joined, each followed by a space.
    if (starts_with(line, ">")) {
      buffer_append(buf, "<blockquote>");

      while (1) {
        const char *content = skip_whitespace(line);
        if (*content == '>') content++;
        content = skip_whitespace(content);
        process_inline(buf, content, strlen(content));
        buffer_append_char(buf, ' ');

        if (*ptr == '\n') ptr++;
        if (!*ptr) break;

        line_start = ptr;
        line = copy_line(line_start, &ptr, 0);
        if (!line) break;

        if (!starts_with(line, ">")) {
          ptr = line_start;  // not ours: let the outer loop re-read it
          break;
        }
      }

      buffer_append(buf, "</blockquote>");
      continue;
    }

    // Horizontal rule
    if (is_horizontal_rule(line)) {
      buffer_append(buf, "<hr>");
      if (*ptr == '\n') ptr++;
      continue;
    }

    // Unordered list
    if (is_unordered_list(line)) {
      buffer_append(buf, "<ul>");

      while (1) {
        const char *content = skip_whitespace(line);
        content += 2;  // skip the bullet and the space after it

        buffer_append(buf, "<li>");
        process_inline(buf, content, strlen(content));
        buffer_append(buf, "</li>");

        if (*ptr == '\n') ptr++;
        if (!*ptr) break;

        line_start = ptr;
        line = copy_line(line_start, &ptr, 0);
        if (!line) break;

        if (!is_unordered_list(line)) {
          ptr = line_start;
          break;
        }
      }

      buffer_append(buf, "</ul>");
      continue;
    }

    // Ordered list
    if (is_ordered_list(line)) {
      buffer_append(buf, "<ol>");

      while (1) {
        const char *content = skip_whitespace(line);
        while (*content && isdigit_c(*content)) content++;
        if (*content == '.') content++;
        content = skip_whitespace(content);

        buffer_append(buf, "<li>");
        process_inline(buf, content, strlen(content));
        buffer_append(buf, "</li>");

        if (*ptr == '\n') ptr++;
        if (!*ptr) break;

        line_start = ptr;
        line = copy_line(line_start, &ptr, 0);
        if (!line) break;

        if (!is_ordered_list(line)) {
          ptr = line_start;
          break;
        }
      }

      buffer_append(buf, "</ol>");
      continue;
    }

    // Table: a '|' row only counts as a table header when the line after it
    // is a separator row; otherwise it falls through to the paragraph case.
    if (is_table_row(line)) {
      const char *next_line_start = ptr;
      if (*next_line_start == '\n') next_line_start++;

      const char *next_line_end = next_line_start;
      char *next_line = copy_line(next_line_start, &next_line_end, 0);

      if (next_line && is_table_separator(next_line)) {
        buffer_append(buf, "<table>");

        // Header row
        buffer_append(buf, "<thead>");
        parse_table_row(buf, line, 1);
        buffer_append(buf, "</thead>");

        // Continue after the separator row
        ptr = next_line_end;
        if (*ptr == '\n') ptr++;

        // Body rows
        buffer_append(buf, "<tbody>");
        while (*ptr) {
          line_start = ptr;
          line = copy_line(line_start, &ptr, 0);
          if (!line) break;

          if (!is_table_row(line) || is_empty_line(line)) {
            ptr = line_start;
            break;
          }

          parse_table_row(buf, line, 0);
          if (*ptr == '\n') ptr++;
        }
        buffer_append(buf, "</tbody>");

        buffer_append(buf, "</table>");
        continue;
      }
    }

    // Paragraph: lines are joined with a single space until a blank line or
    // the start of any other block.
    buffer_append(buf, "<p>");

    while (1) {
      const char *content = skip_whitespace(line);
      process_inline(buf, content, strlen(content));

      if (*ptr == '\n') ptr++;
      if (!*ptr) break;

      line_start = ptr;
      line = copy_line(line_start, &ptr, 0);
      if (!line) break;

      if (is_empty_line(line) ||
          count_heading_level(line) > 0 ||
          starts_with(line, "```") ||
          starts_with(line, ">") ||
          is_horizontal_rule(line) ||
          is_unordered_list(line) ||
          is_ordered_list(line) ||
          is_table_row(line)) {
        ptr = line_start;
        break;
      }

      buffer_append_char(buf, ' ');
    }

    buffer_append(buf, "</p>");
  }

  return buf->data;
}

// Length of `str` in bytes for callers on the JS side; a null pointer is
// reported as length 0.
WASM_EXPORT size_t markdown_strlen(const char *str)
{
  if (!str) return 0;
  return strlen(str);
}