view markdown_converter/markdown_to_html.c @ 200:90dfcef375fb

Added my own S3 bucket uploader URL to mrjunejune.
author MrJuneJune <me@mrjunejune.com>
date Sat, 14 Feb 2026 16:32:24 -0800
parents a2725419f988
children
line wrap: on
line source

#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include "markdown_converter/markdown_to_html.h"

// Allocation entry point for the JavaScript side: returns `size` bytes
// obtained from malloc() (NULL if malloc fails). The block is released
// later through wasm_free().
MDAPI void *wasm_alloc(size_t size) {
    void *block = malloc(size);
    return block;
}

// Counterpart of wasm_alloc(): hands `ptr` back to the allocator with free().
MDAPI void wasm_free(void* ptr) {
    void *block = ptr;
    free(block);
}

// Initial capacity of the HTML output buffer: 1 MiB. The expansion is
// parenthesized so the macro stays a single value inside larger expressions
// (e.g. `x / INITIAL_BUFFER_SIZE`).
#define INITIAL_BUFFER_SIZE (1024 * 1024)

// Growable string buffer for building HTML output.
// Invariant: data[length] is always '\0', so `data` is a valid C string.
typedef struct {
  char  *data;      // heap storage
  size_t length;    // bytes stored, not counting the terminator
  size_t capacity;  // bytes allocated for data
} StringBuffer;

// Allocate an empty buffer with room for `initial_capacity` bytes.
// A capacity of 0 is raised to 1 so the terminator always fits.
// Returns NULL if either allocation fails.
static StringBuffer *buffer_create(size_t initial_capacity)
{
  if (initial_capacity == 0) initial_capacity = 1;

  StringBuffer *buf = (StringBuffer *)malloc(sizeof(StringBuffer));
  if (!buf) return NULL;

  buf->data = (char *)malloc(initial_capacity);
  if (!buf->data) {
    free(buf);
    return NULL;
  }
  buf->data[0] = '\0';
  buf->length = 0;
  buf->capacity = initial_capacity;
  return buf;
}

// Make room for `needed` more bytes plus the terminating NUL, doubling the
// capacity as often as required.
// Returns 1 on success. Returns 0, leaving the buffer untouched, when the
// required size does not fit in size_t or realloc fails; callers must not
// write in that case.
static int buffer_grow(StringBuffer *buf, size_t needed)
{
  const size_t max_size = (size_t)-1;

  // length + needed + 1 must not wrap around.
  if (needed >= max_size - buf->length) return 0;

  size_t required = buf->length + needed + 1;
  if (required <= buf->capacity) return 1;

  // Start from at least 1 so doubling makes progress even for capacity 0.
  size_t new_capacity = buf->capacity ? buf->capacity : 1;
  while (new_capacity < required) {
    if (new_capacity > max_size / 2) {
      new_capacity = required;  // doubling would overflow: take the exact size
      break;
    }
    new_capacity *= 2;
  }

  char *new_data = (char *)realloc(buf->data, new_capacity);
  if (!new_data) return 0;

  buf->data = new_data;
  buf->capacity = new_capacity;
  return 1;
}

// Append the NUL-terminated string `str`.
// If the buffer cannot grow, the append is dropped and the buffer keeps its
// previous, still valid contents.
static void buffer_append(StringBuffer *buf, const char *str)
{
  size_t len = strlen(str);
  if (!buffer_grow(buf, len)) return;
  memcpy(buf->data + buf->length, str, len + 1);  // copies the terminator too
  buf->length += len;
}

// Append exactly `n` bytes starting at `str` (no terminator required in `str`).
// Dropped without effect if the buffer cannot grow.
static void buffer_append_n(StringBuffer *buf, const char *str, size_t n)
{
  if (!buffer_grow(buf, n)) return;
  memcpy(buf->data + buf->length, str, n);
  buf->length += n;
  buf->data[buf->length] = '\0';
}

// Append the single character `c`.
// Dropped without effect if the buffer cannot grow.
static void buffer_append_char(StringBuffer *buf, char c)
{
  if (!buffer_grow(buf, 1)) return;
  buf->data[buf->length++] = c;
  buf->data[buf->length] = '\0';
}

// Release the buffer and its storage. Passing NULL is allowed.
static void buffer_free(StringBuffer *buf)
{
  if (buf) {
    free(buf->data);
    free(buf);
  }
}

// Forward declaration
static void process_inline(StringBuffer *buf, const char *text, size_t len);

// Return 1 if `line`, ignoring its leading whitespace, begins with
// `pattern`; return 0 otherwise.
static int starts_with(const char *line, const char *pattern)
{
  const char *cursor = line;
  for (; *cursor != '\0'; cursor++) {
    if (!isspace((unsigned char)*cursor)) break;
  }

  size_t pattern_len = strlen(pattern);
  if (strncmp(cursor, pattern, pattern_len) != 0) return 0;
  return 1;
}

// Return the heading level (1-6) of `line`: the number of '#' characters
// after any leading whitespace, provided a space follows them.
// Returns 0 when the line is not a heading (no '#', more than six, or no
// space after the markers).
static int count_heading_level(const char *line)
{
  const char *p = line;
  while (*p != '\0' && isspace((unsigned char)*p)) p++;

  int hashes = 0;
  for (; hashes < 6; hashes++) {
    if (p[hashes] != '#') break;
  }

  if (hashes == 0) return 0;
  return (p[hashes] == ' ') ? hashes : 0;
}

// Return a pointer to the first non-whitespace character of `str`, or to
// its terminating NUL when the string is all whitespace.
static const char *skip_whitespace(const char *str)
{
  const char *p = str;
  for (;;) {
    if (*p == '\0') break;
    if (!isspace((unsigned char)*p)) break;
    p++;
  }
  return p;
}

// Return 1 if `line` holds nothing but whitespace (the empty string
// included), 0 as soon as any other character is found.
static int is_empty_line(const char *line)
{
  for (const char *p = line; *p != '\0'; p++) {
    if (!isspace((unsigned char)*p)) return 0;
  }
  return 1;
}

// Return 1 if `line` is a horizontal rule: after leading whitespace it
// consists of at least three copies of one marker ('-', '*' or '_'),
// optionally separated by whitespace, and nothing else.
static int is_horizontal_rule(const char *line)
{
  const char *p = skip_whitespace(line);
  const char marker = *p;

  switch (marker) {
    case '-':
    case '*':
    case '_':
      break;
    default:
      return 0;
  }

  int marker_count = 0;
  for (; *p != '\0'; p++) {
    if (*p == marker) {
      marker_count++;
      continue;
    }
    // Anything other than the marker or whitespace disqualifies the line.
    if (!isspace((unsigned char)*p)) return 0;
  }
  return marker_count >= 3;
}

// Return 1 if `line`, after leading whitespace, starts with a bullet
// ('-', '*' or '+') immediately followed by a space; 0 otherwise.
static int is_unordered_list(const char *line)
{
  const char *p = skip_whitespace(line);
  switch (p[0]) {
    case '-':
    case '*':
    case '+':
      return p[1] == ' ';
    default:
      return 0;
  }
}

// Return 1 if `line`, after leading whitespace, looks like the start of an
// HTML tag: '<' followed by '/' (closing tag), '!' (comment/declaration)
// or a letter (tag name). Only that one character after '<' is inspected.
static int is_html_block_start(const char *line)
{
  const char *p = skip_whitespace(line);
  if (p[0] != '<') return 0;

  const char after = p[1];
  if (after == '/' || after == '!') return 1;
  return isalpha((unsigned char)after) ? 1 : 0;
}

// Check if line starts with a specific HTML tag (e.g., "script", "style").
// Leading whitespace is ignored, the tag name is matched case-insensitively,
// and both the opening ("<tag") and closing ("</tag") forms are accepted.
// Returns 1 on a match, 0 otherwise.
static int is_html_tag(const char *line, const char *tag)
{
  while (*line && isspace((unsigned char)*line)) line++;
  if (*line != '<') return 0;
  line++;

  // Skip the optional '/' of a closing tag.
  if (*line == '/') line++;

  // Compared by hand with tolower(): strncasecmp() is POSIX (<strings.h>),
  // not ISO C. A NUL in `line` never equals a tag character, so the loop
  // cannot read past the end of `line`.
  size_t tag_len = strlen(tag);
  for (size_t i = 0; i < tag_len; i++) {
    if (tolower((unsigned char)line[i]) != tolower((unsigned char)tag[i])) return 0;
  }

  // The name must end here, so "<scripts>" does not match "script".
  char next = line[tag_len];
  return next == '>' || next == ' ' || next == '\t' || next == '\n' || next == '\0';
}

// Check if line is an ordered list item: optional leading whitespace, one
// or more digits, a '.', then a space (e.g. "1. item").
// Returns 1 on a match, 0 otherwise.
static int is_ordered_list(const char *line)
{
  while (*line && isspace((unsigned char)*line)) line++;

  const char *digits = line;
  while (isdigit((unsigned char)*line)) line++;

  // Require an actual number: ". text" is not a list item.
  if (line == digits) return 0;

  return line[0] == '.' && line[1] == ' ';
}

// Check if line could be a table row (contains |)
static int is_table_row(const char *line)
{
  line = skip_whitespace(line);
  // Must contain at least one |
  return strchr(line, '|') != NULL;
}

// Check if line is a table separator (|---|---|)
static int is_table_separator(const char *line)
{
  line = skip_whitespace(line);
  int has_dash = 0;
  int has_pipe = 0;

  while (*line) {
    char c = *line;
    if (c == '|') has_pipe = 1;
    else if (c == '-') has_dash = 1;
    else if (c == ':') ; // alignment marker, allowed
    else if (isspace((unsigned char)c)) ; // whitespace allowed
    else return 0; // invalid character for separator
    line++;
  }

  return has_dash && has_pipe;
}

// Determine the column alignment encoded by one separator cell
// (":---:", "---:", ":---"). Surrounding whitespace is ignored.
// Returns: 0 = left (default, also for an empty cell), 1 = center, 2 = right
static int parse_alignment(const char *cell, size_t len)
{
  const char *first = cell;
  const char *last = cell + len;  // one past the final character

  while (first < last && isspace((unsigned char)*first)) first++;
  while (last > first && isspace((unsigned char)last[-1])) last--;

  if (first == last) return 0;

  const int starts_with_colon = (*first == ':');
  const int ends_with_colon = (last[-1] == ':');

  // Without a trailing colon the cell is left-aligned; with one, a leading
  // colon decides between center and right.
  if (!ends_with_colon) return 0;
  return starts_with_colon ? 1 : 2;
}

// Count the columns of a table row. A leading '|' is ignored; every other
// '|' closes one column, and non-whitespace text after the last '|' counts
// as one more. The result is never less than 1.
static int count_table_columns(const char *line)
{
  const char *p = skip_whitespace(line);
  if (*p == '|') p++;  // a leading pipe does not close a column

  int pipes = 0;
  int trailing_content = 0;
  for (; *p != '\0'; p++) {
    if (*p == '|') {
      pipes++;
      trailing_content = 0;
    } else if (!isspace((unsigned char)*p)) {
      trailing_content = 1;
    }
  }

  int columns = pipes + (trailing_content ? 1 : 0);
  if (columns <= 0) return 1;
  return columns;
}

// Callback invoked once per table cell: `cell`/`len` delimit the trimmed
// cell text (not NUL-terminated at `len`), `align` is the column alignment
// code and `is_header` is passed through from the row.
typedef void (*cell_callback)(StringBuffer *buf, const char *cell, size_t len, int align, int is_header);

// Split a table row into cells and call `cb` exactly `num_cols` times.
// A leading '|' is skipped, each cell is trimmed of surrounding whitespace,
// cells beyond `num_cols` are ignored and missing cells are reported as
// empty. `alignments` may be NULL, in which case every cell gets align 0.
static void parse_table_row(StringBuffer *buf, const char *line, int *alignments, int num_cols, int is_header, cell_callback cb)
{
  const char *cursor = skip_whitespace(line);
  if (*cursor == '|') cursor++;

  int column = 0;
  const char *cell_begin = cursor;

  for (; *cursor != '\0' && column < num_cols; cursor++) {
    const int at_pipe = (*cursor == '|');
    const int at_last_char = (cursor[1] == '\0');
    if (!at_pipe && !at_last_char) continue;

    // Without a trailing pipe the final character belongs to the cell.
    const char *cell_end = at_pipe ? cursor : cursor + 1;

    while (cell_begin < cell_end && isspace((unsigned char)*cell_begin)) cell_begin++;
    while (cell_end > cell_begin && isspace((unsigned char)cell_end[-1])) cell_end--;

    cb(buf, cell_begin, (size_t)(cell_end - cell_begin),
       alignments ? alignments[column] : 0, is_header);

    column++;
    cell_begin = cursor + 1;
  }

  // Pad short rows so every row has the same number of cells.
  for (; column < num_cols; column++) {
    cb(buf, "", 0, alignments ? alignments[column] : 0, is_header);
  }
}

// Write one table cell to `buf`: <th> for header rows, <td> otherwise, with
// an inline text-align style for center (align 1) or right (align 2).
// The cell text goes through process_inline().
static void emit_table_cell(StringBuffer *buf, const char *cell, size_t len, int align, int is_header)
{
  const char *tag = "td";
  if (is_header) tag = "th";

  const char *style;
  switch (align) {
    case 1:
      style = " style=\"text-align:center\"";
      break;
    case 2:
      style = " style=\"text-align:right\"";
      break;
    default:
      style = "";
      break;
  }

  // Opening tag
  buffer_append(buf, "<");
  buffer_append(buf, tag);
  buffer_append(buf, style);
  buffer_append(buf, ">");

  process_inline(buf, cell, len);

  // Closing tag
  buffer_append(buf, "</");
  buffer_append(buf, tag);
  buffer_append(buf, ">");
}

// Fill `alignments[0..num_cols)` from a separator row such as "|:--|--:|".
// Cells are split on '|' (a leading '|' is skipped) and each one is decoded
// with parse_alignment(). Entries for columns the row does not provide are
// left untouched.
static void parse_alignments(const char *line, int *alignments, int num_cols)
{
  const char *cursor = skip_whitespace(line);
  if (*cursor == '|') cursor++;

  const char *cell_begin = cursor;
  int column = 0;

  while (*cursor != '\0' && column < num_cols) {
    const int at_pipe = (*cursor == '|');
    if (at_pipe || cursor[1] == '\0') {
      // Without a trailing pipe the final character belongs to the cell.
      size_t cell_len = (size_t)(cursor - cell_begin) + (at_pipe ? 0 : 1);
      alignments[column++] = parse_alignment(cell_begin, cell_len);
      cell_begin = cursor + 1;
    }
    cursor++;
  }
}

// Append text[0..n) with '<', '>' and '&' replaced by HTML entities, so the
// text is displayed literally instead of being parsed as markup.
static void append_escaped_text(StringBuffer *buf, const char *text, size_t n)
{
  for (size_t k = 0; k < n; k++) {
    switch (text[k]) {
      case '<': buffer_append(buf, "&lt;"); break;
      case '>': buffer_append(buf, "&gt;"); break;
      case '&': buffer_append(buf, "&amp;"); break;
      default:  buffer_append_char(buf, text[k]); break;
    }
  }
}

// Append text[0..n) for use inside a double-quoted attribute value: a '"'
// would end the attribute early, so it is written as &quot;.
static void append_attribute_value(StringBuffer *buf, const char *text, size_t n)
{
  for (size_t k = 0; k < n; k++) {
    if (text[k] == '"') buffer_append(buf, "&quot;");
    else buffer_append_char(buf, text[k]);
  }
}

// Process inline markdown (links, images, bold, strikethrough, italic, code)
// in text[0..len) and append the resulting HTML to `buf`.
//
// At each position the constructs are tried in the order above; the first
// one that finds its closing delimiter wins. Bold, strikethrough and italic
// contents are processed recursively. Link text is copied as-is, and inline
// code is HTML-escaped like fenced code blocks. Any character that starts no
// complete construct is copied through unchanged, so raw HTML in the text
// reaches the output untouched.
static void process_inline(StringBuffer *buf, const char *text, size_t len)
{
  size_t i = 0;

  while (i < len) {
    // Links: [text](url)
    if (text[i] == '[') {
      size_t link_start = i + 1;
      size_t link_end = link_start;
      while (link_end < len && text[link_end] != ']') link_end++;

      if (link_end < len && link_end + 1 < len && text[link_end + 1] == '(') {
        size_t url_start = link_end + 2;
        size_t url_end = url_start;
        while (url_end < len && text[url_end] != ')') url_end++;

        if (url_end < len) {
          buffer_append(buf, "<a href=\"");
          append_attribute_value(buf, text + url_start, url_end - url_start);
          buffer_append(buf, "\">");
          buffer_append_n(buf, text + link_start, link_end - link_start);
          buffer_append(buf, "</a>");
          i = url_end + 1;
          continue;
        }
      }
    }

    // Images: ![alt](url)
    if (text[i] == '!' && i + 1 < len && text[i + 1] == '[') {
      size_t alt_start = i + 2;
      size_t alt_end = alt_start;
      while (alt_end < len && text[alt_end] != ']') alt_end++;

      if (alt_end < len && alt_end + 1 < len && text[alt_end + 1] == '(') {
        size_t url_start = alt_end + 2;
        size_t url_end = url_start;
        while (url_end < len && text[url_end] != ')') url_end++;

        if (url_end < len) {
          buffer_append(buf, "<img src=\"");
          append_attribute_value(buf, text + url_start, url_end - url_start);
          buffer_append(buf, "\" alt=\"");
          append_attribute_value(buf, text + alt_start, alt_end - alt_start);
          buffer_append(buf, "\">");
          i = url_end + 1;
          continue;
        }
      }
    }

    // Bold: **text** or __text__
    if ((text[i] == '*' && i + 1 < len && text[i + 1] == '*') ||
        (text[i] == '_' && i + 1 < len && text[i + 1] == '_')) {
      char marker = text[i];
      size_t start = i + 2;
      size_t end = start;
      while (end + 1 < len && !(text[end] == marker && text[end + 1] == marker)) end++;

      if (end + 1 < len) {
        buffer_append(buf, "<strong>");
        process_inline(buf, text + start, end - start);
        buffer_append(buf, "</strong>");
        i = end + 2;
        continue;
      }
    }

    // Strikethrough: ~~text~~
    if (text[i] == '~' && i + 1 < len && text[i + 1] == '~') {
      size_t start = i + 2;
      size_t end = start;
      while (end + 1 < len && !(text[end] == '~' && text[end + 1] == '~')) end++;

      if (end + 1 < len) {
        buffer_append(buf, "<del>");
        process_inline(buf, text + start, end - start);
        buffer_append(buf, "</del>");
        i = end + 2;
        continue;
      }
    }

    // Italic: *text* or _text_ (the marker must not be followed by whitespace,
    // and the emphasized text must not be empty)
    if ((text[i] == '*' || text[i] == '_') && i + 1 < len && !isspace((unsigned char)text[i + 1])) {
      char marker = text[i];
      size_t start = i + 1;
      size_t end = start;
      while (end < len && text[end] != marker) end++;

      if (end < len && end > start) {
        buffer_append(buf, "<em>");
        process_inline(buf, text + start, end - start);
        buffer_append(buf, "</em>");
        i = end + 1;
        continue;
      }
    }

    // Inline code: `code` -- the contents are literal text, so markup
    // characters are escaped just as they are in fenced code blocks.
    if (text[i] == '`') {
      size_t start = i + 1;
      size_t end = start;
      while (end < len && text[end] != '`') end++;

      if (end < len) {
        buffer_append(buf, "<code>");
        append_escaped_text(buf, text + start, end - start);
        buffer_append(buf, "</code>");
        i = end + 1;
        continue;
      }
    }

    // Plain character: copied unescaped so raw HTML passes through.
    buffer_append_char(buf, text[i]);
    i++;
  }
}

// Copy the line that starts at *cursor into a newly allocated,
// NUL-terminated string (the newline itself is not copied).
// *cursor is always advanced to the line's terminating '\n' or '\0'.
// If len_out is non-NULL it receives the line length on success.
// Returns NULL when malloc fails; the caller frees the result.
static char *md_copy_line(const char **cursor, size_t *len_out)
{
  const char *start = *cursor;
  const char *end = start;
  while (*end && *end != '\n') end++;
  *cursor = end;

  size_t len = (size_t)(end - start);
  char *line = (char *)malloc(len + 1);
  if (!line) return NULL;

  memcpy(line, start, len);
  line[len] = '\0';
  if (len_out) *len_out = len;
  return line;
}

// Convert markdown to HTML.
//
// The input is processed line by line. Each non-blank line is classified in
// this order: heading, fenced code block, blockquote, horizontal rule,
// unordered list, ordered list, table (a row followed by a separator row),
// raw HTML block, paragraph. Multi-line constructs consume the following
// lines that belong to them.
//
// Returns a malloc'ed NUL-terminated HTML string that the caller releases
// with markdown_free(), or NULL if `markdown` is NULL, the output buffer
// cannot be created, or the copy of a block's first line cannot be allocated.
MDAPI char *markdown_to_html(const char *markdown)
{
  if (!markdown) return NULL;

  StringBuffer *buf = buffer_create(INITIAL_BUFFER_SIZE);
  if (!buf) return NULL;

  const char *ptr = markdown;

  while (*ptr) {
    size_t line_len = 0;
    char *line = md_copy_line(&ptr, &line_len);
    if (!line) {
      buffer_free(buf);
      return NULL;
    }

    // Skip empty lines
    if (is_empty_line(line)) {
      free(line);
      if (*ptr == '\n') ptr++;
      continue;
    }

    // Headings: # H1, ## H2, etc.
    int heading_level = count_heading_level(line);
    if (heading_level > 0) {
      const char *content = skip_whitespace(line);
      while (*content == '#') content++;
      content = skip_whitespace(content);

      char tag[8];  // large enough for "</h6>" plus the terminator
      snprintf(tag, sizeof(tag), "<h%d>", heading_level);
      buffer_append(buf, tag);
      process_inline(buf, content, strlen(content));
      snprintf(tag, sizeof(tag), "</h%d>", heading_level);
      buffer_append(buf, tag);

      free(line);
      if (*ptr == '\n') ptr++;
      continue;
    }

    // Code block: ``` ... ``` (an unclosed fence runs to the end of input)
    if (starts_with(line, "```")) {
      buffer_append(buf, "<pre><code>");
      free(line);
      if (*ptr == '\n') ptr++;

      while (*ptr) {
        line = md_copy_line(&ptr, &line_len);
        if (!line) break;

        if (starts_with(line, "```")) {
          free(line);
          if (*ptr == '\n') ptr++;
          break;
        }

        // Escape HTML in code blocks
        for (size_t i = 0; i < line_len; i++) {
          if (line[i] == '<') buffer_append(buf, "&lt;");
          else if (line[i] == '>') buffer_append(buf, "&gt;");
          else if (line[i] == '&') buffer_append(buf, "&amp;");
          else buffer_append_char(buf, line[i]);
        }
        buffer_append_char(buf, '\n');

        free(line);
        if (*ptr == '\n') ptr++;
      }

      buffer_append(buf, "</code></pre>");
      continue;
    }

    // Blockquote: consecutive lines starting with >
    if (starts_with(line, ">")) {
      buffer_append(buf, "<blockquote>");

      while (1) {
        const char *content = skip_whitespace(line);
        if (*content == '>') content++;
        content = skip_whitespace(content);
        process_inline(buf, content, strlen(content));
        buffer_append_char(buf, ' ');

        free(line);
        if (*ptr == '\n') ptr++;

        // Check next line
        if (!*ptr) break;
        const char *next_start = ptr;
        line = md_copy_line(&ptr, NULL);
        if (!line) break;

        if (!starts_with(line, ">")) {
          // Not part of the quote: rewind so the main loop handles it.
          ptr = next_start;
          free(line);
          break;
        }
      }

      buffer_append(buf, "</blockquote>");
      continue;
    }

    // Horizontal rule
    if (is_horizontal_rule(line)) {
      buffer_append(buf, "<hr>");
      free(line);
      if (*ptr == '\n') ptr++;
      continue;
    }

    // Unordered list: consecutive bullet lines
    if (is_unordered_list(line)) {
      buffer_append(buf, "<ul>");

      while (1) {
        const char *content = skip_whitespace(line);
        content += 2; // Skip "- " or "* " or "+ "

        buffer_append(buf, "<li>");
        process_inline(buf, content, strlen(content));
        buffer_append(buf, "</li>");

        free(line);
        if (*ptr == '\n') ptr++;

        // Check next line
        if (!*ptr) break;
        const char *next_start = ptr;
        line = md_copy_line(&ptr, NULL);
        if (!line) break;

        if (!is_unordered_list(line)) {
          ptr = next_start;
          free(line);
          break;
        }
      }

      buffer_append(buf, "</ul>");
      continue;
    }

    // Ordered list: consecutive "N. " lines
    if (is_ordered_list(line)) {
      buffer_append(buf, "<ol>");

      while (1) {
        const char *content = skip_whitespace(line);
        while (*content && isdigit((unsigned char)*content)) content++;
        if (*content == '.') content++;
        content = skip_whitespace(content);

        buffer_append(buf, "<li>");
        process_inline(buf, content, strlen(content));
        buffer_append(buf, "</li>");

        free(line);
        if (*ptr == '\n') ptr++;

        // Check next line
        if (!*ptr) break;
        const char *next_start = ptr;
        line = md_copy_line(&ptr, NULL);
        if (!line) break;

        if (!is_ordered_list(line)) {
          ptr = next_start;
          free(line);
          break;
        }
      }

      buffer_append(buf, "</ol>");
      continue;
    }

    // Table: | col1 | col2 | followed by |---|---|
    if (is_table_row(line)) {
      // Peek at next line to see if it's a separator
      const char *peek_ptr = ptr;
      if (*peek_ptr == '\n') peek_ptr++;

      char *next_line = md_copy_line(&peek_ptr, NULL);
      if (next_line) {
        if (is_table_separator(next_line)) {
          // It's a table!
          int num_cols = count_table_columns(line);

          // If calloc fails the table is still emitted, just without
          // alignment: parse_table_row() accepts NULL alignments.
          int *alignments = (int *)calloc((size_t)num_cols, sizeof(int));
          if (alignments) parse_alignments(next_line, alignments, num_cols);
          free(next_line);

          buffer_append(buf, "<table>");

          // Header row (rendered without alignment)
          buffer_append(buf, "<thead><tr>");
          parse_table_row(buf, line, NULL, num_cols, 1, emit_table_cell);
          buffer_append(buf, "</tr></thead>");
          free(line);

          // Continue after the separator line
          ptr = peek_ptr;
          if (*ptr == '\n') ptr++;

          // Body rows
          buffer_append(buf, "<tbody>");

          while (*ptr) {
            const char *row_start = ptr;
            line = md_copy_line(&ptr, NULL);
            if (!line) break;

            if (!is_table_row(line) || is_empty_line(line)) {
              ptr = row_start;
              free(line);
              break;
            }

            buffer_append(buf, "<tr>");
            parse_table_row(buf, line, alignments, num_cols, 0, emit_table_cell);
            buffer_append(buf, "</tr>");

            free(line);
            if (*ptr == '\n') ptr++;
          }

          buffer_append(buf, "</tbody></table>");
          free(alignments);
          continue;
        }
        free(next_line);
      }
      // Not a table: fall through and treat the line as HTML or a paragraph.
    }

    // HTML block - pass through unchanged
    if (is_html_block_start(line)) {
      // Script and style bodies are copied verbatim up to their closing tag
      int is_script = is_html_tag(line, "script");
      int is_style = is_html_tag(line, "style");

      if (is_script || is_style) {
        const char *end_tag = is_script ? "</script>" : "</style>";

        // The element may already be closed on its first line, e.g.
        // <script src="x.js"></script>; then no further lines belong to it.
        int found_end = (strstr(line, end_tag) != NULL);

        // Output the opening line
        buffer_append(buf, line);
        buffer_append_char(buf, '\n');

        free(line);
        if (*ptr == '\n') ptr++;

        // Collect content until closing tag
        while (!found_end && *ptr) {
          line = md_copy_line(&ptr, NULL);
          if (!line) break;

          buffer_append(buf, line);
          buffer_append_char(buf, '\n');

          found_end = (strstr(line, end_tag) != NULL);
          free(line);
          if (*ptr == '\n') ptr++;
        }
        continue;
      }

      // Regular HTML tag - just pass through the line
      buffer_append(buf, line);
      buffer_append_char(buf, '\n');
      free(line);
      if (*ptr == '\n') ptr++;
      continue;
    }

    // Regular paragraph: joined with spaces until a blank line or the start
    // of another block
    buffer_append(buf, "<p>");

    while (1) {
      const char *content = skip_whitespace(line);
      process_inline(buf, content, strlen(content));

      free(line);
      if (*ptr == '\n') ptr++;

      // Check next line - continue paragraph if not special
      if (!*ptr) break;
      const char *next_start = ptr;
      line = md_copy_line(&ptr, NULL);
      if (!line) break;

      if (is_empty_line(line) ||
          count_heading_level(line) > 0 ||
          starts_with(line, "```") ||
          starts_with(line, ">") ||
          is_horizontal_rule(line) ||
          is_unordered_list(line) ||
          is_ordered_list(line) ||
          is_table_row(line) ||
          is_html_block_start(line)) {
        ptr = next_start;
        free(line);
        break;
      }

      buffer_append_char(buf, ' ');
    }

    buffer_append(buf, "</p>");
  }

  // Hand the character data to the caller; only the wrapper struct is freed.
  char *result = buf->data;
  free(buf);
  return result;
}

// Release an HTML string with free(). Passing NULL is harmless.
MDAPI void markdown_free(char *html)
{
  void *block = html;
  free(block);
}

// Return the length in bytes of the HTML string (terminator excluded), or 0
// when `html` is NULL. Intended for WASM memory allocation on the caller side.
MDAPI size_t markdown_get_length(const char *html)
{
  if (html == NULL) return 0;
  return strlen(html);
}