dataframe-cleaner / src /App.svelte
hmb's picture
hmb HF Staff
Update src/App.svelte
ae0c97e verified
<script lang="ts">
import Dataframe from '@gradio/dataframe';
import { pipeline } from '@xenova/transformers';
let rawData = '';
let cleanedData = '';
let cleaningSteps: any[] = [];
let showSteps = false;
let fileInput: HTMLInputElement;
let inputValue: { data: string[][]; headers: string[] } = { data: [[]], headers: [] };
let cleanedValue: { data: string[][]; headers: string[] } = { data: [[]], headers: [] };
// Cleaning UI state
let cleaningInProgress = false;
let cleaningError: string | null = null;
let modelWarning: string | null = null;
let progressCurrent = 0;
let progressTotal = 0;
// Cleaning toggles
let optDedupeRows = true;
let optDedupeColumns = false;
let optRemoveSymbols = true;
let optCollapseSpaces = true;
let optHtmlToText = true;
// Extra LLM instructions to augment standard cleaning
let extraInstructions: string = '';
function parseCSVorTSV(text: string) {
if (!text) return { data: [[]], headers: [] };
const lines = text.trim().split(/\r?\n/);
if (lines.length === 0) return { data: [[]], headers: [] };
const sep = lines[0].includes('\t') ? '\t' : ',';
const headers = lines[0].split(sep).map(h => h.trim());
const data = lines.slice(1).map(line => line.split(sep).map(cell => cell.trim()));
return normalizeTable({ data, headers });
}
function hasNonEmptyTable(value: { data: string[][]; headers: string[] } | null | undefined): boolean {
if (!value) return false;
const headersOk = Array.isArray(value.headers) && value.headers.length > 0;
const rowsOk = Array.isArray(value.data) && value.data.length > 0;
if (!headersOk || !rowsOk) return false;
// At least one non-empty cell
return value.data.some((row) => Array.isArray(row) && row.some((cell) => String(cell ?? '').trim() !== ''));
}
function normalizeTable(value: { data: unknown[][]; headers: unknown[] } | null | undefined): { data: string[][]; headers: string[] } {
if (!value || !Array.isArray(value.headers) || value.headers.length === 0) {
return { data: [], headers: [] };
}
const headers: string[] = value.headers.map((h) => String(h ?? ''));
const width = headers.length;
const data: string[][] = (Array.isArray(value.data) ? value.data : []).map((row) => {
const safeRow = Array.isArray(row) ? row : [];
const padded: string[] = Array.from({ length: width }, (_, i) => String(safeRow[i] ?? ''));
return padded;
});
return { headers, data };
}
function padRowToWidth(row: unknown[] | null | undefined, width: number): string[] {
const safeRow = Array.isArray(row) ? row : [];
return Array.from({ length: width }, (_, i) => String(safeRow[i] ?? ''));
}
function htmlToText(input: string): string {
return input.replace(/<[^>]*>/g, '');
}
function removeSymbols(input: string): string {
// Keep letters, numbers, common punctuation; remove emojis and other symbols
return input
.replace(/[\p{Extended_Pictographic}]/gu, '')
.replace(/[\u200B-\u200D\uFEFF]/g, '')
.replace(/[^\p{L}\p{N}\s\-_'",.;:!?()\[\]{}@#$%&*/+\\=<>^~`|]/gu, '');
}
function collapseSpaces(input: string): string {
return input.replace(/\s+/g, ' ').trim();
}
function dedupeRows(headers: string[], rows: string[][]): string[][] {
const seen = new Set<string>();
const out: string[][] = [];
for (const r of rows) {
const key = r.join('\u0001');
if (!seen.has(key)) {
seen.add(key);
out.push(padRowToWidth(r, headers.length));
}
}
return out;
}
function dedupeColumns(headers: string[], rows: string[][]): { headers: string[]; rows: string[][] } {
const seen = new Set<string>();
const keepIdx: number[] = [];
const outHeaders: string[] = [];
headers.forEach((h, i) => {
if (!seen.has(h)) {
seen.add(h);
keepIdx.push(i);
outHeaders.push(h);
}
});
const outRows = rows.map((r) => keepIdx.map((i) => r[i] ?? ''));
return { headers: outHeaders, rows: outRows };
}
function updateInputValueFromRaw() {
inputValue = parseCSVorTSV(rawData);
}
function updateRawFromInputValue() {
// Convert inputValue back to CSV string
if (!inputValue.headers.length) return;
const sep = ',';
const lines = [inputValue.headers.join(sep), ...inputValue.data.map(row => row.join(sep))];
rawData = lines.join('\n');
}
function handleFileUpload(event: Event) {
const files = (event.target as HTMLInputElement).files;
if (files && files.length > 0) {
const reader = new FileReader();
reader.onload = (e) => {
rawData = e.target?.result as string;
updateInputValueFromRaw();
};
reader.readAsText(files[0]);
}
}
function handleInputChange(e: CustomEvent) {
inputValue = normalizeTable(e.detail);
updateRawFromInputValue();
}
let textCleaner: any = null;
let loadingModel = false;
// Heuristic fallback cleaner if model output is unusable
function fallbackClean(text: string): string {
if (!text) return '';
// Remove URLs
let out = text.replace(/https?:\/\/\S+/g, '');
// Remove emojis and non-text pictographs
out = out.replace(/[\p{Extended_Pictographic}]/gu, '');
// Remove HTML tags and zero-width characters
out = out.replace(/<[^>]*>/g, '').replace(/[\u200B-\u200D\uFEFF]/g, '');
// Normalize quotes and whitespace
out = out
.replace(/[\u2018\u2019\u201A\u201B]/g, "'")
.replace(/[\u201C\u201D\u201E\u201F]/g, '"')
.replace(/[\u2013\u2014]/g, '-')
.normalize('NFC')
.replace(/"{2,}/g, '"')
.replace(/'{2,}/g, "'")
.replace(/[!?.]{2,}/g, (m: string) => m[0])
.replace(/\s+/g, ' ')
.trim();
// Remove stray leading 'clean:' tokens if any echoed
out = out.replace(/^\s*clean:\s*/i, '');
return out;
}
function isLikelyNumericOrDate(text: string): boolean {
const t = (text || '').trim();
if (!t) return false;
// Currency/number like $12.00, 12, 12.0 USD
if (/^[\p{Sc}]?\s?\d{1,3}(?:[.,]\d{3})*(?:[.,]\d+)?(?:\s?(?:USD|EUR|GBP|JPY|AUD|CAD))?$/iu.test(t)) return true;
// Date-like: YYYY-MM-DD, YYYY/MM/DD, DD-MM-YYYY, MM-DD-YYYY, with ., / or -
if (/^(?:\d{4}[\-\/.]\d{2}[\-\/.]\d{2}|\d{2}[\-\/.]\d{2}[\-\/.]\d{4})$/.test(t)) return true;
return false;
}
function jaccardTokenSimilarity(a: string, b: string): number {
const toTokens = (s: string) => (s.match(/[\p{L}\p{N}\-']+/gu) ?? []).map((t) => t.toLowerCase());
const A = new Set(toTokens(a));
const B = new Set(toTokens(b));
if (A.size === 0 && B.size === 0) return 1;
let inter = 0;
for (const t of A) if (B.has(t)) inter += 1;
const union = new Set([...A, ...B]).size;
return union === 0 ? 0 : inter / union;
}
function isPlausibleClean(source: string, candidate: string): boolean {
if (!candidate) return false;
const len = candidate.length;
const srcLen = source.length;
if (len > Math.max(8, Math.floor(srcLen * 1.4)) || len < Math.floor(srcLen * 0.4)) return false;
const punctOnly = candidate.replace(/[\p{L}\p{N}]/gu, '').length / Math.max(1, len) > 0.35;
if (punctOnly) return false;
const sim = jaccardTokenSimilarity(source, candidate);
if (sim < 0.5) return false;
return true;
}
async function cleanTextWithModel(text: string): Promise<string> {
if (!text || !text.trim()) return '';
// Skip model for numeric/date-ish content
if (isLikelyNumericOrDate(text)) return fallbackClean(text);
if (!textCleaner) {
loadingModel = true;
// Use a small instruction-capable text2text model for deterministic edits
// If unavailable, this will throw and we fallback deterministically.
textCleaner = await pipeline('text2text-generation', 'Xenova/t5-small');
loadingModel = false;
}
const prompt =
'clean: ' +
text
.replace(/https?:\/\/\S+/g, '')
.replace(/[\p{Extended_Pictographic}]/gu, '')
.replace(/\s+/g, ' ')
.trim() +
(extraInstructions && extraInstructions.trim().length > 0
? `\nrules: ${extraInstructions.trim()}`
: '');
try {
const output = await textCleaner(prompt, {
max_new_tokens: 64,
temperature: 0,
do_sample: false
});
const generated = output?.[0]?.generated_text ?? '';
let candidate = generated.split('\n')[0].trim().replace(/^"|"$/g, '');
// Normalize whitespace and quotes
candidate = candidate
.replace(/^\s*clean:\s*/i, '')
.replace(/\bclean:\s*/gi, '')
.replace(/[\u2018\u2019\u201A\u201B]/g, "'")
.replace(/[\u201C\u201D\u201E\u201F]/g, '"')
.replace(/[\u2013\u2014]/g, '-')
.replace(/<[^>]*>/g, '')
.replace(/[\u200B-\u200D\uFEFF]/g, '')
.replace(/"{2,}/g, '"')
.replace(/'{2,}/g, "'")
.replace(/[!?.]{2,}/g, (m: string) => m[0])
.replace(/\s+/g, ' ')
.trim();
if (!isPlausibleClean(text, candidate)) return fallbackClean(text);
return candidate;
} catch (e) {
return fallbackClean(text);
}
}
async function analyzeAndClean() {
cleaningError = null;
modelWarning = null;
showSteps = false;
cleaningSteps = [];
cleanedValue = { data: [[]], headers: [] };
cleanedData = '';
// Ensure there is a table to clean
if (!inputValue.headers?.length) {
cleaningError = 'No headers detected. Please paste a table with headers in the first row.';
return;
}
// Start with current headers/rows
let outHeaders = [...inputValue.headers];
let workingRows = inputValue.data.map((r) => padRowToWidth(r, outHeaders.length));
// Deterministic transforms applied first
if (optDedupeColumns) {
const dc = dedupeColumns(outHeaders, workingRows);
outHeaders = dc.headers;
workingRows = dc.rows.map((r) => padRowToWidth(r, outHeaders.length));
}
if (optDedupeRows) {
workingRows = dedupeRows(outHeaders, workingRows);
}
// Iterate rows
cleaningInProgress = true;
progressCurrent = 0;
const numRows = workingRows.length;
const numCols = outHeaders.length;
progressTotal = numRows * numCols;
const outRows: string[][] = [];
for (const row of workingRows) {
const baseRow = padRowToWidth(row, numCols);
const cleanedCells: string[] = [];
for (let c = 0; c < numCols; c += 1) {
let cell = (baseRow?.[c] ?? '').toString();
// Apply deterministic cell transforms
if (optHtmlToText) cell = htmlToText(cell);
if (optRemoveSymbols) cell = removeSymbols(cell);
if (optCollapseSpaces) cell = collapseSpaces(cell);
let cleaned = '';
try {
cleaned = await cleanTextWithModel(cell);
} catch (e) {
if (!modelWarning) modelWarning = 'Model unavailable. Used deterministic fallback cleaning.';
cleaned = fallbackClean(cell);
}
if (!cleaned) cleaned = fallbackClean(cell);
cleanedCells.push(cleaned);
progressCurrent += 1;
}
const newRow = cleanedCells;
outRows.push(newRow);
}
cleanedValue = normalizeTable({ headers: outHeaders, data: outRows });
// Keep a CSV copy in cleanedData for export convenience
const sep = ',';
const lines = [outHeaders.join(sep), ...outRows.map(r => r.join(sep))];
cleanedData = lines.join('\n');
// Note for transparency
cleaningSteps = [{ step: `Cleaned all ${numCols} columns and replaced values in the preview.`, accepted: true }];
showSteps = true;
cleaningInProgress = false;
setTimeout(() => {
const resultsSection = document.querySelector('.results-card');
if (resultsSection) {
resultsSection.scrollIntoView({
behavior: 'smooth',
block: 'start',
inline: 'nearest'
});
}
}, 100);
}
function toggleStep(idx: number) {
cleaningSteps[idx].accepted = !cleaningSteps[idx].accepted;
}
function exportCleaned() {
// Export cleanedValue as CSV
if (!cleanedValue.headers.length) return;
const sep = ',';
const lines = [cleanedValue.headers.join(sep), ...cleanedValue.data.map(row => row.join(sep))];
const csv = lines.join('\n');
const blob = new Blob([csv], { type: 'text/csv' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'cleaned_data.csv';
a.click();
URL.revokeObjectURL(url);
}
$: updateInputValueFromRaw();
// No-op reactive blocks here
</script>
<div class="app-container df-theme">
<header class="app-header">
<div class="header-content">
<h1>Tabular Data Cleaner</h1>
<p class="subtitle">Clean and transform your spreadsheet data with AI assistance</p>
</div>
</header>
<main class="app-main">
<div class="content-grid">
<!-- Input Section -->
<section class="input-card">
<div class="card-header">
<h2>Import Data</h2>
<p class="card-subtitle">Upload a file or paste your CSV/TSV data</p>
</div>
<div class="upload-area">
<input type="file" accept=".csv,.tsv,.txt" bind:this={fileInput} on:change={handleFileUpload} id="file-input" class="file-input" />
<label for="file-input" class="file-label">
<div class="upload-icon">📁</div>
<span>Choose file or drag & drop</span>
</label>
</div>
<div class="divider">
<span>or</span>
</div>
<textarea
id="data-input"
bind:value={rawData}
rows="8"
class="data-textarea"
placeholder="Paste CSV or TSV data here..."
on:input={updateInputValueFromRaw}
></textarea>
{#if hasNonEmptyTable(inputValue)}
<div class="preview-section">
<h3>Data Preview</h3>
<div class="dataframe-wrapper">
<Dataframe
bind:value={inputValue}
show_search="search"
show_row_numbers={true}
show_copy_button={true}
show_fullscreen_button={true}
editable={true}
on:change={handleInputChange}
/>
</div>
</div>
{/if}
</section>
<!-- Controls Section -->
<section class="controls-card">
<div class="card-header">
<h2>Cleaning Options</h2>
</div>
<div class="control-group">
<h3>LLM Instructions</h3>
<textarea
rows="3"
bind:value={extraInstructions}
class="instructions-textarea"
placeholder="e.g., expand abbreviations, fix capitalization..."
></textarea>
</div>
<div class="control-group">
<h3>Transform Options</h3>
<div class="checkbox-group">
<label class="checkbox-label">
<input type="checkbox" bind:checked={optDedupeRows} />
<span class="checkmark"></span>
Deduplicate rows
</label>
<label class="checkbox-label">
<input type="checkbox" bind:checked={optDedupeColumns} />
<span class="checkmark"></span>
Deduplicate columns
</label>
<label class="checkbox-label">
<input type="checkbox" bind:checked={optRemoveSymbols} />
<span class="checkmark"></span>
Remove symbols/emojis
</label>
<label class="checkbox-label">
<input type="checkbox" bind:checked={optCollapseSpaces} />
<span class="checkmark"></span>
Remove extra spaces
</label>
<label class="checkbox-label">
<input type="checkbox" bind:checked={optHtmlToText} />
<span class="checkmark"></span>
Convert HTML to text
</label>
</div>
</div>
<div class="action-buttons">
<button
class="btn-primary"
on:click={analyzeAndClean}
disabled={cleaningInProgress || loadingModel || !inputValue.headers?.length}
>
{#if loadingModel}
<span class="spinner"></span>
Loading model...
{:else if cleaningInProgress}
<span class="spinner"></span>
Cleaning {progressCurrent}/{progressTotal}...
{:else}
✨ Clean Data
{/if}
</button>
<button
class="btn-secondary"
on:click={exportCleaned}
disabled={!hasNonEmptyTable(cleanedValue)}
>
💾 Export
</button>
</div>
{#if cleaningError}
<div class="alert alert-error">{cleaningError}</div>
{/if}
{#if modelWarning}
<div class="alert alert-warning">{modelWarning}</div>
{/if}
</section>
</div>
<!-- Results Section -->
{#if hasNonEmptyTable(cleanedValue)}
<section class="results-card">
<div class="card-header">
<h2>Cleaned Results</h2>
<p class="card-subtitle">Review and export your cleaned data</p>
</div>
<div class="dataframe-wrapper">
<Dataframe
bind:value={cleanedValue}
show_search="search"
show_row_numbers={true}
show_copy_button={true}
show_fullscreen_button={true}
editable={false}
/>
</div>
</section>
{/if}
</main>
</div>
<style>
/* Reset and base styles */
:global(*) {
box-sizing: border-box;
}
:global(body) {
margin: 0;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', sans-serif;
background: #f8fafc;
color: #1e293b;
line-height: 1.6;
}
/* App container */
.app-container {
min-height: 100vh;
}
/* Header */
.app-header {
margin: 0 auto;
padding: 2rem;
}
.header-content {
max-width: 1200px;
margin: 0 auto;
padding: 0 2rem;
text-align: center;
}
.app-header h1 {
font-size: 2.5rem;
font-weight: 700;
margin: 0 0 0.5rem 0;
background: linear-gradient(135deg, #667eea, #764ba2);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.subtitle {
font-size: 1.1rem;
color: #64748b;
margin: 0;
font-weight: 400;
}
/* Main content */
.app-main {
max-width: 1200px;
margin: 0 auto;
padding: 2rem;
}
.content-grid {
display: grid;
grid-template-columns: 650px 350px;
gap: 2rem;
margin-bottom: 2rem;
justify-content: center;
}
@media (max-width: 768px) {
.content-grid {
grid-template-columns: 1fr;
grid-template-rows: auto auto;
max-width: 100%;
}
.input-card, .controls-card, .results-card {
width: 100%;
}
.control-group {
padding: 0;
}
}
/* Card styles */
.input-card, .controls-card, .results-card {
background: white;
border-radius: 16px;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
border: 1px solid rgba(226, 232, 240, 0.8);
overflow: hidden;
}
.controls-card {
width: 350px;
height: min-content;
}
.card-header {
padding: 1.5rem 1.5rem 1rem 1.5rem;
border-bottom: 1px solid #f1f5f9;
}
.card-header h2 {
font-size: 1.25rem;
font-weight: 600;
margin: 0 0 0.25rem 0;
color: #1e293b;
}
.card-subtitle {
font-size: 0.9rem;
color: #64748b;
margin: 0;
}
/* Upload area */
.upload-area {
padding: 1.5rem;
}
.file-input {
display: none;
}
.file-label {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
padding: 2rem;
border: 2px dashed #cbd5e1;
border-radius: 12px;
background: #f8fafc;
cursor: pointer;
transition: all 0.2s ease;
}
.file-label:hover {
border-color: #667eea;
background: #f1f5f9;
}
.upload-icon {
font-size: 2rem;
margin-bottom: 0.5rem;
}
.file-label span {
color: #475569;
font-weight: 500;
}
/* Divider */
.divider {
display: flex;
align-items: center;
margin: 0 1.5rem;
text-align: center;
}
.divider::before,
.divider::after {
content: '';
flex: 1;
height: 1px;
background: #e2e8f0;
}
.divider span {
padding: 0 1rem;
color: #64748b;
font-size: 0.875rem;
background: white;
}
/* Textareas */
.data-textarea, .instructions-textarea {
padding: 1rem;
border: 1px solid #e2e8f0;
border-radius: 8px;
font-size: 0.875rem;
resize: vertical;
transition: border-color 0.2s ease;
box-sizing: border-box;
line-height: 1.5;
min-height: 120px;
}
.data-textarea {
margin: 1.5rem;
width: calc(100% - 3rem);
font-family: 'JetBrains Mono', 'Fira Code', monospace;
}
.instructions-textarea {
width: 100%;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', sans-serif;
}
.data-textarea:focus, .instructions-textarea:focus {
outline: none;
border-color: #667eea;
box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
}
/* Control groups */
.control-group {
padding: 0 1.5rem 1.5rem 1.5rem;
}
.control-group h3 {
font-size: 1rem;
font-weight: 600;
margin: 0 0 0.75rem 0;
color: #374151;
}
/* Checkbox styles */
.checkbox-group {
display: flex;
flex-direction: column;
gap: 0.75rem;
}
.checkbox-label {
display: flex;
align-items: center;
cursor: pointer;
font-size: 0.9rem;
color: #374151;
}
.checkbox-label input[type="checkbox"] {
display: none;
}
.checkmark {
width: 18px;
height: 18px;
border: 2px solid #d1d5db;
border-radius: 4px;
margin-right: 0.75rem;
display: flex;
align-items: center;
justify-content: center;
transition: all 0.2s ease;
}
.checkbox-label input[type="checkbox"]:checked + .checkmark {
background: #667eea;
border-color: #667eea;
}
.checkbox-label input[type="checkbox"]:checked + .checkmark::after {
content: '✓';
color: white;
font-size: 12px;
font-weight: bold;
}
/* Buttons */
.action-buttons {
padding: 0 1.5rem 1.5rem 1.5rem;
display: flex;
gap: 0.75rem;
}
.btn-primary, .btn-secondary {
padding: 0.75rem 1.5rem;
border-radius: 8px;
border: none;
font-weight: 600;
cursor: pointer;
transition: all 0.2s ease;
display: flex;
align-items: center;
gap: 0.5rem;
}
.btn-primary {
background: linear-gradient(135deg, #667eea, #764ba2);
color: white;
flex: 1;
}
.btn-primary:hover:not(:disabled) {
transform: translateY(-1px);
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1);
}
.btn-secondary {
background: #f8fafc;
color: #475569;
border: 1px solid #e2e8f0;
}
.btn-secondary:hover:not(:disabled) {
background: #f1f5f9;
}
.btn-primary:disabled, .btn-secondary:disabled {
opacity: 0.5;
cursor: not-allowed;
transform: none;
}
/* Spinner */
.spinner {
width: 16px;
height: 16px;
border: 2px solid transparent;
border-top: 2px solid currentColor;
border-radius: 50%;
animation: spin 1s linear infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
/* Alerts */
.alert {
margin: 0 1.5rem 1rem 1.5rem;
padding: 0.75rem 1rem;
border-radius: 8px;
font-size: 0.875rem;
}
.alert-error {
background: #fef2f2;
color: #dc2626;
border: 1px solid #fecaca;
}
.alert-warning {
background: #fffbeb;
color: #d97706;
border: 1px solid #fed7aa;
}
/* Preview section */
.preview-section {
padding: 0 1.5rem 1.5rem 1.5rem;
}
.preview-section h3 {
font-size: 1rem;
font-weight: 600;
margin: 0 0 1rem 0;
color: #374151;
}
/* Results card */
.results-card {
grid-column: 1 / -1;
}
.results-card .dataframe-wrapper {
margin: 1.5rem;
}
/* Theme overrides */
.df-theme {
--gr-df-table-text: #1e293b !important;
}
</style>