Mercurial
view markdown_converter/markdown_to_html.c @ 205:e07b4b5a66bb
Bad named files.
| author | MrJuneJune <me@mrjunejune.com> |
|---|---|
| date | Sun, 15 Feb 2026 11:07:52 -0800 |
| parents | a2725419f988 |
| children |
line wrap: on
line source
#include <string.h> #include <stdlib.h> #include <stdio.h> #include <ctype.h> #include "markdown_converter/markdown_to_html.h" // JavaScript needs this to free the memory later MDAPI void *wasm_alloc(size_t size) { return malloc(size); } MDAPI void wasm_free(void* ptr) { free(ptr); } #define INITIAL_BUFFER_SIZE 1024 * 1024 // 1MB // String buffer for building HTML output typedef struct { char *data; size_t length; size_t capacity; } StringBuffer; static StringBuffer *buffer_create(size_t initial_capacity) { StringBuffer *buf = (StringBuffer *)malloc(sizeof(StringBuffer)); if (!buf) return NULL; buf->data = (char *)malloc(initial_capacity); if (!buf->data) { free(buf); return NULL; } buf->data[0] = '\0'; buf->length = 0; buf->capacity = initial_capacity; return buf; } static void buffer_grow(StringBuffer *buf, size_t needed) { if (buf->length + needed + 1 > buf->capacity) { size_t new_capacity = buf->capacity * 2; while (new_capacity < buf->length + needed + 1) new_capacity *= 2; char *new_data = (char *)realloc(buf->data, new_capacity); if (new_data) { buf->data = new_data; buf->capacity = new_capacity; } } } static void buffer_append(StringBuffer *buf, const char *str) { size_t len = strlen(str); buffer_grow(buf, len); memcpy(buf->data + buf->length, str, len + 1); buf->length += len; } static void buffer_append_n(StringBuffer *buf, const char *str, size_t n) { buffer_grow(buf, n); memcpy(buf->data + buf->length, str, n); buf->length += n; buf->data[buf->length] = '\0'; } static void buffer_append_char(StringBuffer *buf, char c) { buffer_grow(buf, 1); buf->data[buf->length++] = c; buf->data[buf->length] = '\0'; } static void buffer_free(StringBuffer *buf) { if (buf) { free(buf->data); free(buf); } } // Forward declaration static void process_inline(StringBuffer *buf, const char *text, size_t len); // Check if line starts with pattern (after trimming whitespace) static int starts_with(const char *line, const char *pattern) { while (*line && isspace((unsigned char)*line)) line++; return strncmp(line, pattern, strlen(pattern)) == 0; } // Count leading # characters static int count_heading_level(const char *line) { int count = 0; while (*line && isspace((unsigned char)*line)) line++; while (line[count] == '#' && count < 6) count++; if (count > 0 && line[count] == ' ') return count; return 0; } // Skip whitespace static const char *skip_whitespace(const char *str) { while (*str && isspace((unsigned char)*str)) str++; return str; } // Check if line is empty (only whitespace) static int is_empty_line(const char *line) { while (*line) { if (!isspace((unsigned char)*line)) return 0; line++; } return 1; } // Check if line is horizontal rule (---, ***, ___) static int is_horizontal_rule(const char *line) { line = skip_whitespace(line); char first = *line; if (first != '-' && first != '*' && first != '_') return 0; int count = 0; while (*line) { if (*line == first) count++; else if (!isspace((unsigned char)*line)) return 0; line++; } return count >= 3; } // Check if line is unordered list item static int is_unordered_list(const char *line) { line = skip_whitespace(line); return (*line == '-' || *line == '*' || *line == '+') && line[1] == ' '; } // Check if line starts with an HTML tag static int is_html_block_start(const char *line) { line = skip_whitespace(line); if (*line != '<') return 0; line++; // Check for closing tag or comment if (*line == '/' || *line == '!') return 1; // Check for valid tag name (letter followed by alphanumeric) if (!isalpha((unsigned char)*line)) return 0; return 1; } // Check if line starts with a specific HTML tag (e.g., "script", "style") static int is_html_tag(const char *line, const char *tag) { line = skip_whitespace(line); if (*line != '<') return 0; line++; // Skip optional / int is_closing = 0; if (*line == '/') { is_closing = 1; line++; } size_t tag_len = strlen(tag); if (strncasecmp(line, tag, tag_len) != 0) return 0; char next = line[tag_len]; // Tag must be followed by space, >, or end for closing tags return next == '>' || next == ' ' || next == '\t' || next == '\n' || next == '\0'; } // Check if line is ordered list item static int is_ordered_list(const char *line) { line = skip_whitespace(line); while (*line && isdigit((unsigned char)*line)) line++; return *line == '.' && line[1] == ' '; } // Check if line could be a table row (contains |) static int is_table_row(const char *line) { line = skip_whitespace(line); // Must contain at least one | return strchr(line, '|') != NULL; } // Check if line is a table separator (|---|---|) static int is_table_separator(const char *line) { line = skip_whitespace(line); int has_dash = 0; int has_pipe = 0; while (*line) { char c = *line; if (c == '|') has_pipe = 1; else if (c == '-') has_dash = 1; else if (c == ':') ; // alignment marker, allowed else if (isspace((unsigned char)c)) ; // whitespace allowed else return 0; // invalid character for separator line++; } return has_dash && has_pipe; } // Parse alignment from separator cell (e.g., ":---:", "---:", ":---") // Returns: 0 = left (default), 1 = center, 2 = right static int parse_alignment(const char *cell, size_t len) { // Trim whitespace while (len > 0 && isspace((unsigned char)*cell)) { cell++; len--; } while (len > 0 && isspace((unsigned char)cell[len-1])) { len--; } if (len == 0) return 0; int left_colon = (cell[0] == ':'); int right_colon = (len > 0 && cell[len-1] == ':'); if (left_colon && right_colon) return 1; // center if (right_colon) return 2; // right return 0; // left (default) } // Count columns in a table row static int count_table_columns(const char *line) { int count = 0; int in_cell = 0; line = skip_whitespace(line); // Skip leading | if (*line == '|') line++; while (*line) { if (*line == '|') { count++; in_cell = 0; } else if (!isspace((unsigned char)*line)) { in_cell = 1; } line++; } // Count last cell if there was content after last | if (in_cell) count++; return count > 0 ? count : 1; } // Parse table cells and call callback for each typedef void (*cell_callback)(StringBuffer *buf, const char *cell, size_t len, int align, int is_header); static void parse_table_row(StringBuffer *buf, const char *line, int *alignments, int num_cols, int is_header, cell_callback cb) { line = skip_whitespace(line); // Skip leading | if (*line == '|') line++; int col = 0; const char *cell_start = line; while (*line && col < num_cols) { if (*line == '|' || *(line + 1) == '\0') { // End of cell size_t cell_len = line - cell_start; if (*line != '|') cell_len++; // include last char if no trailing | // Trim whitespace from cell while (cell_len > 0 && isspace((unsigned char)*cell_start)) { cell_start++; cell_len--; } while (cell_len > 0 && isspace((unsigned char)cell_start[cell_len-1])) { cell_len--; } int align = (alignments && col < num_cols) ? alignments[col] : 0; cb(buf, cell_start, cell_len, align, is_header); col++; cell_start = line + 1; } line++; } // Fill remaining columns with empty cells while (col < num_cols) { cb(buf, "", 0, alignments ? alignments[col] : 0, is_header); col++; } } static void emit_table_cell(StringBuffer *buf, const char *cell, size_t len, int align, int is_header) { const char *tag = is_header ? "th" : "td"; const char *align_attr = ""; if (align == 1) align_attr = " style=\"text-align:center\""; else if (align == 2) align_attr = " style=\"text-align:right\""; buffer_append(buf, "<"); buffer_append(buf, tag); buffer_append(buf, align_attr); buffer_append(buf, ">"); process_inline(buf, cell, len); buffer_append(buf, "</"); buffer_append(buf, tag); buffer_append(buf, ">"); } // Parse alignments from separator row static void parse_alignments(const char *line, int *alignments, int num_cols) { line = skip_whitespace(line); if (*line == '|') line++; int col = 0; const char *cell_start = line; while (*line && col < num_cols) { if (*line == '|' || *(line + 1) == '\0') { size_t cell_len = line - cell_start; if (*line != '|') cell_len++; alignments[col] = parse_alignment(cell_start, cell_len); col++; cell_start = line + 1; } line++; } } // Process inline markdown (bold, italic, code, links, strikethrough) static void process_inline(StringBuffer *buf, const char *text, size_t len) { size_t i = 0; while (i < len) { // Links: [text](url) if (text[i] == '[') { size_t link_start = i + 1; size_t link_end = link_start; while (link_end < len && text[link_end] != ']') link_end++; if (link_end < len && link_end + 1 < len && text[link_end + 1] == '(') { size_t url_start = link_end + 2; size_t url_end = url_start; while (url_end < len && text[url_end] != ')') url_end++; if (url_end < len) { buffer_append(buf, "<a href=\""); buffer_append_n(buf, text + url_start, url_end - url_start); buffer_append(buf, "\">"); buffer_append_n(buf, text + link_start, link_end - link_start); buffer_append(buf, "</a>"); i = url_end + 1; continue; } } } // Images:  if (text[i] == '!' && i + 1 < len && text[i + 1] == '[') { size_t alt_start = i + 2; size_t alt_end = alt_start; while (alt_end < len && text[alt_end] != ']') alt_end++; if (alt_end < len && alt_end + 1 < len && text[alt_end + 1] == '(') { size_t url_start = alt_end + 2; size_t url_end = url_start; while (url_end < len && text[url_end] != ')') url_end++; if (url_end < len) { buffer_append(buf, "<img src=\""); buffer_append_n(buf, text + url_start, url_end - url_start); buffer_append(buf, "\" alt=\""); buffer_append_n(buf, text + alt_start, alt_end - alt_start); buffer_append(buf, "\">"); i = url_end + 1; continue; } } } // Bold: **text** or __text__ if ((text[i] == '*' && i + 1 < len && text[i + 1] == '*') || (text[i] == '_' && i + 1 < len && text[i + 1] == '_')) { char marker = text[i]; size_t start = i + 2; size_t end = start; while (end + 1 < len && !(text[end] == marker && text[end + 1] == marker)) end++; if (end + 1 < len) { buffer_append(buf, "<strong>"); process_inline(buf, text + start, end - start); buffer_append(buf, "</strong>"); i = end + 2; continue; } } // Strikethrough: ~~text~~ if (text[i] == '~' && i + 1 < len && text[i + 1] == '~') { size_t start = i + 2; size_t end = start; while (end + 1 < len && !(text[end] == '~' && text[end + 1] == '~')) end++; if (end + 1 < len) { buffer_append(buf, "<del>"); process_inline(buf, text + start, end - start); buffer_append(buf, "</del>"); i = end + 2; continue; } } // Italic: *text* or _text_ if ((text[i] == '*' || text[i] == '_') && i + 1 < len && !isspace((unsigned char)text[i + 1])) { char marker = text[i]; size_t start = i + 1; size_t end = start; while (end < len && text[end] != marker) end++; if (end < len && end > start) { buffer_append(buf, "<em>"); process_inline(buf, text + start, end - start); buffer_append(buf, "</em>"); i = end + 1; continue; } } // Inline code: `code` if (text[i] == '`') { size_t start = i + 1; size_t end = start; while (end < len && text[end] != '`') end++; if (end < len) { buffer_append(buf, "<code>"); buffer_append_n(buf, text + start, end - start); buffer_append(buf, "</code>"); i = end + 1; continue; } } // This might not be needed for now. // HTML escape special characters // if (text[i] == '<') { // buffer_append(buf, "<"); // } else if (text[i] == '>') { // buffer_append(buf, ">"); // } else if (text[i] == '&') { // buffer_append(buf, "&"); // } else { // buffer_append_char(buf, text[i]); // } buffer_append_char(buf, text[i]); i++; } } // Convert markdown to HTML MDAPI char *markdown_to_html(const char *markdown) { if (!markdown) return NULL; StringBuffer *buf = buffer_create(INITIAL_BUFFER_SIZE); if (!buf) return NULL; const char *ptr = markdown; const char *line_start; while (*ptr) { line_start = ptr; // Find end of line while (*ptr && *ptr != '\n') ptr++; size_t line_len = ptr - line_start; // Create null-terminated line copy char *line = (char *)malloc(line_len + 1); if (!line) { buffer_free(buf); return NULL; } memcpy(line, line_start, line_len); line[line_len] = '\0'; // Skip empty lines if (is_empty_line(line)) { free(line); if (*ptr == '\n') ptr++; continue; } // Headings: # H1, ## H2, etc. int heading_level = count_heading_level(line); if (heading_level > 0) { const char *content = skip_whitespace(line); while (*content == '#') content++; content = skip_whitespace(content); char tag[8]; snprintf(tag, sizeof(tag), "<h%d>", heading_level); buffer_append(buf, tag); process_inline(buf, content, strlen(content)); snprintf(tag, sizeof(tag), "</h%d>", heading_level); buffer_append(buf, tag); free(line); if (*ptr == '\n') ptr++; continue; } // Code block: ``` if (starts_with(line, "```")) { buffer_append(buf, "<pre><code>"); free(line); if (*ptr == '\n') ptr++; // Collect code content while (*ptr) { line_start = ptr; while (*ptr && *ptr != '\n') ptr++; line_len = ptr - line_start; line = (char *)malloc(line_len + 1); if (!line) break; memcpy(line, line_start, line_len); line[line_len] = '\0'; if (starts_with(line, "```")) { free(line); if (*ptr == '\n') ptr++; break; } // Escape HTML in code blocks for (size_t i = 0; i < line_len; i++) { if (line[i] == '<') buffer_append(buf, "<"); else if (line[i] == '>') buffer_append(buf, ">"); else if (line[i] == '&') buffer_append(buf, "&"); else buffer_append_char(buf, line[i]); } buffer_append_char(buf, '\n'); free(line); if (*ptr == '\n') ptr++; } buffer_append(buf, "</code></pre>"); continue; } // Blockquote: > if (starts_with(line, ">")) { buffer_append(buf, "<blockquote>"); while (1) { const char *content = skip_whitespace(line); if (*content == '>') content++; content = skip_whitespace(content); process_inline(buf, content, strlen(content)); buffer_append_char(buf, ' '); free(line); if (*ptr == '\n') ptr++; // Check next line if (!*ptr) break; line_start = ptr; while (*ptr && *ptr != '\n') ptr++; line_len = ptr - line_start; line = (char *)malloc(line_len + 1); if (!line) break; memcpy(line, line_start, line_len); line[line_len] = '\0'; if (!starts_with(line, ">")) { // Put back the line pointer ptr = line_start; free(line); break; } } buffer_append(buf, "</blockquote>"); continue; } // Horizontal rule if (is_horizontal_rule(line)) { buffer_append(buf, "<hr>"); free(line); if (*ptr == '\n') ptr++; continue; } // Unordered list if (is_unordered_list(line)) { buffer_append(buf, "<ul>"); while (1) { const char *content = skip_whitespace(line); content += 2; // Skip "- " or "* " or "+ " buffer_append(buf, "<li>"); process_inline(buf, content, strlen(content)); buffer_append(buf, "</li>"); free(line); if (*ptr == '\n') ptr++; // Check next line if (!*ptr) break; line_start = ptr; while (*ptr && *ptr != '\n') ptr++; line_len = ptr - line_start; line = (char *)malloc(line_len + 1); if (!line) break; memcpy(line, line_start, line_len); line[line_len] = '\0'; if (!is_unordered_list(line)) { ptr = line_start; free(line); break; } } buffer_append(buf, "</ul>"); continue; } // Ordered list if (is_ordered_list(line)) { buffer_append(buf, "<ol>"); while (1) { const char *content = skip_whitespace(line); while (*content && isdigit((unsigned char)*content)) content++; if (*content == '.') content++; content = skip_whitespace(content); buffer_append(buf, "<li>"); process_inline(buf, content, strlen(content)); buffer_append(buf, "</li>"); free(line); if (*ptr == '\n') ptr++; // Check next line if (!*ptr) break; line_start = ptr; while (*ptr && *ptr != '\n') ptr++; line_len = ptr - line_start; line = (char *)malloc(line_len + 1); if (!line) break; memcpy(line, line_start, line_len); line[line_len] = '\0'; if (!is_ordered_list(line)) { ptr = line_start; free(line); break; } } buffer_append(buf, "</ol>"); continue; } // Table: | col1 | col2 | followed by |---|---| if (is_table_row(line)) { // Peek at next line to see if it's a separator const char *peek_ptr = ptr; if (*peek_ptr == '\n') peek_ptr++; const char *next_line_start = peek_ptr; while (*peek_ptr && *peek_ptr != '\n') peek_ptr++; size_t next_line_len = peek_ptr - next_line_start; char *next_line = (char *)malloc(next_line_len + 1); if (next_line) { memcpy(next_line, next_line_start, next_line_len); next_line[next_line_len] = '\0'; if (is_table_separator(next_line)) { // It's a table! int num_cols = count_table_columns(line); int *alignments = (int *)calloc(num_cols, sizeof(int)); buffer_append(buf, "<table>"); // Header row buffer_append(buf, "<thead><tr>"); parse_table_row(buf, line, NULL, num_cols, 1, emit_table_cell); buffer_append(buf, "</tr></thead>"); free(line); if (*ptr == '\n') ptr++; // Parse alignments from separator parse_alignments(next_line, alignments, num_cols); free(next_line); // Skip separator line ptr = peek_ptr; if (*ptr == '\n') ptr++; // Body rows buffer_append(buf, "<tbody>"); while (*ptr) { line_start = ptr; while (*ptr && *ptr != '\n') ptr++; line_len = ptr - line_start; line = (char *)malloc(line_len + 1); if (!line) break; memcpy(line, line_start, line_len); line[line_len] = '\0'; if (!is_table_row(line) || is_empty_line(line)) { ptr = line_start; free(line); break; } buffer_append(buf, "<tr>"); parse_table_row(buf, line, alignments, num_cols, 0, emit_table_cell); buffer_append(buf, "</tr>"); free(line); if (*ptr == '\n') ptr++; } buffer_append(buf, "</tbody></table>"); free(alignments); continue; } free(next_line); } } // HTML block - pass through unchanged if (is_html_block_start(line)) { // Check if it's a script or style tag that needs special handling int is_script = is_html_tag(line, "script"); int is_style = is_html_tag(line, "style"); if (is_script || is_style) { const char *end_tag = is_script ? "</script>" : "</style>"; // Output the opening line buffer_append(buf, line); buffer_append_char(buf, '\n'); free(line); if (*ptr == '\n') ptr++; // Collect content until closing tag while (*ptr) { line_start = ptr; while (*ptr && *ptr != '\n') ptr++; line_len = ptr - line_start; line = (char *)malloc(line_len + 1); if (!line) break; memcpy(line, line_start, line_len); line[line_len] = '\0'; buffer_append(buf, line); buffer_append_char(buf, '\n'); int found_end = (strstr(line, end_tag) != NULL); free(line); if (*ptr == '\n') ptr++; if (found_end) break; } continue; } // Regular HTML tag - just pass through the line buffer_append(buf, line); buffer_append_char(buf, '\n'); free(line); if (*ptr == '\n') ptr++; continue; } // Regular paragraph buffer_append(buf, "<p>"); while (1) { const char *content = skip_whitespace(line); process_inline(buf, content, strlen(content)); free(line); if (*ptr == '\n') ptr++; // Check next line - continue paragraph if not special if (!*ptr) break; line_start = ptr; while (*ptr && *ptr != '\n') ptr++; line_len = ptr - line_start; line = (char *)malloc(line_len + 1); if (!line) break; memcpy(line, line_start, line_len); line[line_len] = '\0'; if (is_empty_line(line) || count_heading_level(line) > 0 || starts_with(line, "```") || starts_with(line, ">") || is_horizontal_rule(line) || is_unordered_list(line) || is_ordered_list(line) || is_table_row(line) || is_html_block_start(line)) { ptr = line_start; free(line); break; } buffer_append_char(buf, ' '); } buffer_append(buf, "</p>"); } char *result = buf->data; free(buf); // Free struct but not data return result; } // Free the returned HTML string MDAPI void markdown_free(char *html) { free(html); } // Get length of HTML string (for WASM memory allocation) MDAPI size_t markdown_get_length(const char *html) { return html ? strlen(html) : 0; }