view markdown_converter/markdown_to_html.c @ 173:827c6ac504cd hg-web

Merged in default here.
author MrJuneJune <me@mrjunejune.com>
date Mon, 19 Jan 2026 18:59:10 -0800
parents 1c0878eb17de
children 8c74204fd362
line wrap: on
line source

#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include "markdown_converter/markdown_to_html.h"

// Initial capacity, in bytes, of the HTML output buffer (1 MiB).
// Parenthesised so the macro stays a single value inside larger expressions.
#define INITIAL_BUFFER_SIZE (1024 * 1024)

// Growable character buffer used to build the HTML output.
// Invariant: data[length] is always '\0', so data is a valid C string.
typedef struct {
  char  *data;     // heap storage holding the text plus its terminator
  size_t length;   // bytes of text in use, terminator not counted
  size_t capacity; // bytes allocated for data
} StringBuffer;

// Allocates an empty buffer with room for initial_capacity bytes.
// A request for 0 bytes is raised to 1 so the terminator always fits.
// Returns NULL if either allocation fails.
static StringBuffer *buffer_create(size_t initial_capacity)
{
  if (initial_capacity == 0) initial_capacity = 1;

  StringBuffer *buf = (StringBuffer *)malloc(sizeof *buf);
  if (!buf) return NULL;

  buf->data = (char *)malloc(initial_capacity);
  if (!buf->data) {
    free(buf);
    return NULL;
  }
  buf->data[0] = '\0';
  buf->length = 0;
  buf->capacity = initial_capacity;
  return buf;
}

// Makes sure `needed` more bytes plus the terminator fit in the buffer,
// doubling the capacity until they do.
// Returns 1 on success. Returns 0, leaving the buffer untouched, when the
// required size does not fit in size_t or realloc fails.
static int buffer_grow(StringBuffer *buf, size_t needed)
{
  const size_t max_size = (size_t)-1;

  // length + needed + 1 must not wrap around.
  if (needed >= max_size - buf->length) return 0;

  size_t required = buf->length + needed + 1;
  if (required <= buf->capacity) return 1;

  size_t new_capacity = buf->capacity ? buf->capacity : 1;
  while (new_capacity < required) {
    if (new_capacity > max_size / 2) {
      // Doubling would overflow; ask for exactly what is needed instead.
      new_capacity = required;
      break;
    }
    new_capacity *= 2;
  }

  char *new_data = (char *)realloc(buf->data, new_capacity);
  if (!new_data) return 0;

  buf->data = new_data;
  buf->capacity = new_capacity;
  return 1;
}

// Appends the NUL-terminated string `str`.
// If the buffer cannot grow, nothing is appended and the buffer stays valid.
static void buffer_append(StringBuffer *buf, const char *str)
{
  size_t len = strlen(str);
  if (!buffer_grow(buf, len)) return;
  memcpy(buf->data + buf->length, str, len + 1); // +1 copies the terminator
  buf->length += len;
}

// Appends exactly `n` bytes starting at `str` and re-terminates the buffer.
// If the buffer cannot grow, nothing is appended and the buffer stays valid.
static void buffer_append_n(StringBuffer *buf, const char *str, size_t n)
{
  if (!buffer_grow(buf, n)) return;
  memcpy(buf->data + buf->length, str, n);
  buf->length += n;
  buf->data[buf->length] = '\0';
}

// Appends the single character `c` and re-terminates the buffer.
// If the buffer cannot grow, nothing is appended and the buffer stays valid.
static void buffer_append_char(StringBuffer *buf, char c)
{
  if (!buffer_grow(buf, 1)) return;
  buf->data[buf->length++] = c;
  buf->data[buf->length] = '\0';
}

// Releases the buffer and its storage. Passing NULL is allowed.
static void buffer_free(StringBuffer *buf)
{
  if (buf) {
    free(buf->data);
    free(buf);
  }
}

// Returns 1 when `line`, with leading whitespace ignored, begins with
// `pattern`; otherwise 0.
static int starts_with(const char *line, const char *pattern)
{
  const char *cursor = line;
  for (; *cursor != '\0'; cursor++) {
    if (!isspace((unsigned char)*cursor)) break;
  }

  size_t pattern_len = strlen(pattern);
  if (strncmp(cursor, pattern, pattern_len) != 0) return 0;
  return 1;
}

// Returns the heading level (1-6) when `line`, after optional leading
// whitespace, starts with that many '#' characters followed by a space.
// Returns 0 for anything else, including a run of seven or more '#'.
static int count_heading_level(const char *line)
{
  const char *p = line;
  while (*p != '\0' && isspace((unsigned char)*p)) p++;

  int level = 0;
  for (; level < 6; level++) {
    if (p[level] != '#') break;
  }

  if (level == 0) return 0;
  return (p[level] == ' ') ? level : 0;
}

// Returns a pointer to the first non-whitespace character of `str`, or to its
// terminating NUL when the string is all whitespace.
static const char *skip_whitespace(const char *str)
{
  for (;;) {
    unsigned char ch = (unsigned char)*str;
    if (ch == '\0' || !isspace(ch)) return str;
    str++;
  }
}

// Returns 1 when `line` contains nothing but whitespace (or is empty), else 0.
static int is_empty_line(const char *line)
{
  const char *p = line;
  while (*p != '\0' && isspace((unsigned char)*p)) p++;

  // Reaching the terminator means no visible character was found.
  return *p == '\0';
}

// Returns 1 when `line` is a horizontal rule: after leading whitespace it
// consists only of one repeated marker ('-', '*' or '_') and whitespace, with
// the marker appearing at least three times. Returns 0 otherwise.
static int is_horizontal_rule(const char *line)
{
  const char *p = skip_whitespace(line);
  const char marker = *p;

  switch (marker) {
    case '-':
    case '*':
    case '_':
      break;
    default:
      return 0;
  }

  int seen = 0;
  for (; *p != '\0'; p++) {
    if (*p == marker) {
      seen++;
      continue;
    }
    // Anything besides the marker must be whitespace.
    if (!isspace((unsigned char)*p)) return 0;
  }
  return seen >= 3;
}

// Returns 1 when `line`, after leading whitespace, starts with a bullet
// ('-', '*' or '+') immediately followed by a space; otherwise 0.
static int is_unordered_list(const char *line)
{
  const char *p = skip_whitespace(line);

  switch (*p) {
    case '-':
    case '*':
    case '+':
      return p[1] == ' ';
    default:
      return 0;
  }
}

// Returns 1 when `line`, after leading whitespace, starts with one or more
// decimal digits followed by ". " (for example "12. item"); otherwise 0.
// A line with no digits, such as ". text", is not a list item.
static int is_ordered_list(const char *line)
{
  while (*line && isspace((unsigned char)*line)) line++;

  const char *digits = line;
  while (isdigit((unsigned char)*line)) line++;
  if (line == digits) return 0; // no number before the dot

  return *line == '.' && line[1] == ' ';
}

// Appends text[0..n) as the body of a <code> element, writing '<', '>' and
// '&' as entities so the code is shown literally (the same escaping that
// fenced code blocks receive).
static void append_code_text(StringBuffer *buf, const char *text, size_t n)
{
  for (size_t i = 0; i < n; i++) {
    switch (text[i]) {
      case '<': buffer_append(buf, "&lt;");  break;
      case '>': buffer_append(buf, "&gt;");  break;
      case '&': buffer_append(buf, "&amp;"); break;
      default:  buffer_append_char(buf, text[i]); break;
    }
  }
}

// Appends text[0..n) inside a double-quoted attribute value. A literal '"'
// would end the attribute early, so it is written as &quot;.
static void append_attribute_text(StringBuffer *buf, const char *text, size_t n)
{
  for (size_t i = 0; i < n; i++) {
    if (text[i] == '"') buffer_append(buf, "&quot;");
    else buffer_append_char(buf, text[i]);
  }
}

// Process inline markdown (links, images, bold, strikethrough, italic, code).
//
// Scans text[0..len) left to right and appends the HTML rendering to `buf`.
// At each position the constructs are tried in the order listed above; the
// first one that has a matching closing delimiter inside the range wins. An
// opener without a closer is emitted as a plain character.
//
//   [label](url)        -> <a href="url">label</a>   (label copied verbatim)
//   ![alt](url)         -> <img src="url" alt="alt">
//   **x** or __x__      -> <strong>x</strong>        (x processed recursively)
//   ~~x~~               -> <del>x</del>              (x processed recursively)
//   *x* or _x_          -> <em>x</em>                (x processed recursively)
//   `x`                 -> <code>x</code>            (x HTML-escaped)
//
// `text` does not need to be NUL-terminated; only `len` bytes are read.
static void process_inline(StringBuffer *buf, const char *text, size_t len)
{
  size_t i = 0;

  while (i < len) {
    // Links: [text](url)
    if (text[i] == '[') {
      size_t link_start = i + 1;
      size_t link_end = link_start;
      while (link_end < len && text[link_end] != ']') link_end++;

      // The ']' must be directly followed by '('.
      if (link_end < len && link_end + 1 < len && text[link_end + 1] == '(') {
        size_t url_start = link_end + 2;
        size_t url_end = url_start;
        while (url_end < len && text[url_end] != ')') url_end++;

        if (url_end < len) {
          buffer_append(buf, "<a href=\"");
          append_attribute_text(buf, text + url_start, url_end - url_start);
          buffer_append(buf, "\">");
          buffer_append_n(buf, text + link_start, link_end - link_start);
          buffer_append(buf, "</a>");
          i = url_end + 1;
          continue;
        }
      }
    }

    // Images: ![alt](url)
    if (text[i] == '!' && i + 1 < len && text[i + 1] == '[') {
      size_t alt_start = i + 2;
      size_t alt_end = alt_start;
      while (alt_end < len && text[alt_end] != ']') alt_end++;

      if (alt_end < len && alt_end + 1 < len && text[alt_end + 1] == '(') {
        size_t url_start = alt_end + 2;
        size_t url_end = url_start;
        while (url_end < len && text[url_end] != ')') url_end++;

        if (url_end < len) {
          buffer_append(buf, "<img src=\"");
          append_attribute_text(buf, text + url_start, url_end - url_start);
          buffer_append(buf, "\" alt=\"");
          append_attribute_text(buf, text + alt_start, alt_end - alt_start);
          buffer_append(buf, "\">");
          i = url_end + 1;
          continue;
        }
      }
    }

    // Bold: **text** or __text__ (checked before italic so the doubled
    // marker is not consumed as two single markers)
    if ((text[i] == '*' && i + 1 < len && text[i + 1] == '*') ||
        (text[i] == '_' && i + 1 < len && text[i + 1] == '_')) {
      char marker = text[i];
      size_t start = i + 2;
      size_t end = start;
      while (end + 1 < len && !(text[end] == marker && text[end + 1] == marker)) end++;

      if (end + 1 < len) {
        buffer_append(buf, "<strong>");
        process_inline(buf, text + start, end - start);
        buffer_append(buf, "</strong>");
        i = end + 2;
        continue;
      }
    }

    // Strikethrough: ~~text~~
    if (text[i] == '~' && i + 1 < len && text[i + 1] == '~') {
      size_t start = i + 2;
      size_t end = start;
      while (end + 1 < len && !(text[end] == '~' && text[end + 1] == '~')) end++;

      if (end + 1 < len) {
        buffer_append(buf, "<del>");
        process_inline(buf, text + start, end - start);
        buffer_append(buf, "</del>");
        i = end + 2;
        continue;
      }
    }

    // Italic: *text* or _text_ (the opener must not be followed by
    // whitespace, and the emphasised text must not be empty)
    if ((text[i] == '*' || text[i] == '_') && i + 1 < len && !isspace((unsigned char)text[i + 1])) {
      char marker = text[i];
      size_t start = i + 1;
      size_t end = start;
      while (end < len && text[end] != marker) end++;

      if (end < len && end > start) {
        buffer_append(buf, "<em>");
        process_inline(buf, text + start, end - start);
        buffer_append(buf, "</em>");
        i = end + 1;
        continue;
      }
    }

    // Inline code: `code` (content is literal: no nested markdown, and HTML
    // special characters are escaped)
    if (text[i] == '`') {
      size_t start = i + 1;
      size_t end = start;
      while (end < len && text[end] != '`') end++;

      if (end < len) {
        buffer_append(buf, "<code>");
        append_code_text(buf, text + start, end - start);
        buffer_append(buf, "</code>");
        i = end + 1;
        continue;
      }
    }

    // Everything else is copied verbatim; '<', '>' and '&' in ordinary text
    // are not escaped.
    buffer_append_char(buf, text[i]);
    i++;
  }
}

// Copies the line that starts at *cursor (everything up to, but not
// including, the next '\n' or the terminating NUL) into a new heap string.
// On success *cursor is left on that '\n' or NUL and the caller owns the copy.
// Returns NULL, leaving *cursor untouched, if malloc fails.
static char *md_copy_line(const char **cursor)
{
  const char *end = *cursor;
  while (*end && *end != '\n') end++;

  size_t len = (size_t)(end - *cursor);
  char *line = (char *)malloc(len + 1);
  if (!line) return NULL;

  memcpy(line, *cursor, len);
  line[len] = '\0';
  *cursor = end;
  return line;
}

// Convert markdown to HTML
//
// Reads `markdown` line by line and emits one HTML element per block:
//   - blank lines are skipped
//   - "# " .. "###### "            -> <h1> .. <h6>
//   - a line starting with ```      -> <pre><code>, up to the next ``` line or
//                                      end of input; content is HTML-escaped
//   - consecutive lines with ">"    -> one <blockquote>, lines joined by ' '
//   - ---, ***, ___                 -> <hr>
//   - consecutive "- ", "* ", "+ "  -> <ul> with one <li> per line
//   - consecutive "N. "             -> <ol> with one <li> per line
//   - anything else                 -> <p>, continued until a blank line or a
//                                      line that starts another block
// Heading, blockquote, list-item and paragraph text goes through
// process_inline().
//
// Returns a heap-allocated NUL-terminated string that the caller releases
// with markdown_free() (or free()). Returns NULL when `markdown` is NULL or
// when a memory allocation made by this function fails.
MDAPI char *markdown_to_html(const char *markdown)
{
  if (!markdown) return NULL;

  StringBuffer *buf = buffer_create(INITIAL_BUFFER_SIZE);
  if (!buf) return NULL;

  const char *ptr = markdown;

  while (*ptr) {
    // ptr is at the start of a line; after the copy it sits on '\n' or NUL.
    char *line = md_copy_line(&ptr);
    if (!line) {
      buffer_free(buf);
      return NULL;
    }

    // Skip empty lines
    if (is_empty_line(line)) {
      free(line);
      if (*ptr == '\n') ptr++;
      continue;
    }

    // Headings: # H1, ## H2, etc.
    int heading_level = count_heading_level(line);
    if (heading_level > 0) {
      const char *content = skip_whitespace(line);
      while (*content == '#') content++;
      content = skip_whitespace(content);

      char tag[8]; // large enough for "</h6>" plus terminator
      snprintf(tag, sizeof(tag), "<h%d>", heading_level);
      buffer_append(buf, tag);
      process_inline(buf, content, strlen(content));
      snprintf(tag, sizeof(tag), "</h%d>", heading_level);
      buffer_append(buf, tag);

      free(line);
      if (*ptr == '\n') ptr++;
      continue;
    }

    // Code block: ``` (anything after the fence on the opening line is ignored)
    if (starts_with(line, "```")) {
      buffer_append(buf, "<pre><code>");
      free(line);
      if (*ptr == '\n') ptr++;

      // Collect code content until the closing fence or end of input
      while (*ptr) {
        line = md_copy_line(&ptr);
        if (!line) {
          buffer_free(buf);
          return NULL;
        }

        if (starts_with(line, "```")) {
          free(line);
          if (*ptr == '\n') ptr++;
          break;
        }

        // Escape HTML in code blocks
        for (const char *c = line; *c; c++) {
          if (*c == '<') buffer_append(buf, "&lt;");
          else if (*c == '>') buffer_append(buf, "&gt;");
          else if (*c == '&') buffer_append(buf, "&amp;");
          else buffer_append_char(buf, *c);
        }
        buffer_append_char(buf, '\n');

        free(line);
        if (*ptr == '\n') ptr++;
      }

      buffer_append(buf, "</code></pre>");
      continue;
    }

    // Blockquote: >
    if (starts_with(line, ">")) {
      buffer_append(buf, "<blockquote>");

      while (1) {
        const char *content = skip_whitespace(line);
        if (*content == '>') content++;
        content = skip_whitespace(content);
        process_inline(buf, content, strlen(content));
        buffer_append_char(buf, ' ');

        free(line);
        if (*ptr == '\n') ptr++;

        // Peek at the next line; it joins the quote only if it starts with '>'
        if (!*ptr) break;
        const char *next_start = ptr;
        line = md_copy_line(&ptr);
        if (!line) {
          buffer_free(buf);
          return NULL;
        }

        if (!starts_with(line, ">")) {
          // Rewind so the outer loop sees this line again
          ptr = next_start;
          free(line);
          break;
        }
      }

      buffer_append(buf, "</blockquote>");
      continue;
    }

    // Horizontal rule (tested before lists so "- - -" and "* * *" are rules)
    if (is_horizontal_rule(line)) {
      buffer_append(buf, "<hr>");
      free(line);
      if (*ptr == '\n') ptr++;
      continue;
    }

    // Unordered list
    if (is_unordered_list(line)) {
      buffer_append(buf, "<ul>");

      while (1) {
        const char *content = skip_whitespace(line);
        content += 2; // Skip "- " or "* " or "+ "

        buffer_append(buf, "<li>");
        process_inline(buf, content, strlen(content));
        buffer_append(buf, "</li>");

        free(line);
        if (*ptr == '\n') ptr++;

        // Peek at the next line; the list ends at the first non-item line
        if (!*ptr) break;
        const char *next_start = ptr;
        line = md_copy_line(&ptr);
        if (!line) {
          buffer_free(buf);
          return NULL;
        }

        if (!is_unordered_list(line)) {
          ptr = next_start;
          free(line);
          break;
        }
      }

      buffer_append(buf, "</ul>");
      continue;
    }

    // Ordered list
    if (is_ordered_list(line)) {
      buffer_append(buf, "<ol>");

      while (1) {
        // Skip the "N." marker and the whitespace after it
        const char *content = skip_whitespace(line);
        while (*content && isdigit((unsigned char)*content)) content++;
        if (*content == '.') content++;
        content = skip_whitespace(content);

        buffer_append(buf, "<li>");
        process_inline(buf, content, strlen(content));
        buffer_append(buf, "</li>");

        free(line);
        if (*ptr == '\n') ptr++;

        // Peek at the next line; the list ends at the first non-item line
        if (!*ptr) break;
        const char *next_start = ptr;
        line = md_copy_line(&ptr);
        if (!line) {
          buffer_free(buf);
          return NULL;
        }

        if (!is_ordered_list(line)) {
          ptr = next_start;
          free(line);
          break;
        }
      }

      buffer_append(buf, "</ol>");
      continue;
    }

    // Regular paragraph
    buffer_append(buf, "<p>");

    while (1) {
      const char *content = skip_whitespace(line);
      process_inline(buf, content, strlen(content));

      free(line);
      if (*ptr == '\n') ptr++;

      // Peek at the next line - continue the paragraph if it is not special
      if (!*ptr) break;
      const char *next_start = ptr;
      line = md_copy_line(&ptr);
      if (!line) {
        buffer_free(buf);
        return NULL;
      }

      if (is_empty_line(line) ||
          count_heading_level(line) > 0 ||
          starts_with(line, "```") ||
          starts_with(line, ">") ||
          is_horizontal_rule(line) ||
          is_unordered_list(line) ||
          is_ordered_list(line)) {
        // Rewind so the outer loop handles this line as its own block
        ptr = next_start;
        free(line);
        break;
      }

      // Consecutive paragraph lines are joined with a single space
      buffer_append_char(buf, ' ');
    }

    buffer_append(buf, "</p>");
  }

  // Hand the text to the caller; only the wrapper struct is released here.
  char *result = buf->data;
  free(buf);
  return result;
}

// Free the returned HTML string
//
// Releases `html` with free(). Intended for strings returned by
// markdown_to_html(), whose storage comes from malloc/realloc.
// Passing NULL is harmless because free(NULL) does nothing.
MDAPI void markdown_free(char *html)
{
  free(html);
}

// Returns the length in bytes of the NUL-terminated string `html`, not
// counting the terminator, or 0 when `html` is NULL.
// (Exposed for WASM callers that need the size to read the result.)
MDAPI size_t markdown_get_length(const char *html)
{
  if (html == NULL) {
    return 0;
  }
  return strlen(html);
}