/**
 * IKEA HuggingFace Dataset Importer
 *
 * Fetches product data from the tsazan/ikea-us-commercetxt dataset on HuggingFace
 * and converts items with valid dimensions into our catalog JSON format.
 *
 * Usage:
 *   node scripts/import-ikea-hf.js [--limit N] [--output path]
 *
 * The HuggingFace dataset stores products in CommerceTXT format where each row
 * is a line of text. Products are spread across multiple rows with sections like
 * @PRODUCT, @SPECS, @IMAGES. This script streams through rows, reassembles
 * product records, extracts dimensions, and generates procedural box meshes.
 */

const DATASET = 'tsazan/ikea-us-commercetxt';
const API_BASE = 'https://datasets-server.huggingface.co';
const BATCH_SIZE = 100;

// Safety limit: stop scanning once the row offset exceeds this value.
const MAX_SCAN_ROWS = 100000;

// Category mapping from IKEA categories to our catalog categories
const CATEGORY_MAP = {
  'sofas': 'seating',
  'armchairs': 'seating',
  'chairs': 'seating',
  'dining chairs': 'seating',
  'office chairs': 'office',
  'desk chairs': 'office',
  'desks': 'tables',
  'dining tables': 'tables',
  'coffee tables': 'tables',
  'side tables': 'tables',
  'console tables': 'tables',
  'nightstands': 'tables',
  'bedside tables': 'tables',
  'bookcases': 'storage',
  'shelving units': 'storage',
  'shelf units': 'storage',
  'dressers': 'storage',
  'chests of drawers': 'storage',
  'wardrobes': 'storage',
  'tv stands': 'storage',
  'tv benches': 'storage',
  'sideboards': 'storage',
  'cabinets': 'storage',
  'beds': 'beds',
  'bed frames': 'beds',
  'kitchen cabinets': 'kitchen',
  'kitchen islands': 'kitchen',
  'base cabinets': 'kitchen',
  'wall cabinets': 'kitchen',
};

// Room mapping based on category
const ROOM_MAP = {
  'seating': ['wohnzimmer'],
  'tables': ['wohnzimmer', 'esszimmer'],
  'storage': ['wohnzimmer', 'arbeitszimmer'],
  'beds': ['schlafzimmer'],
  'kitchen': ['kueche'],
  'office': ['arbeitszimmer'],
};

// Vulgar-fraction glyphs that may follow the whole-inch part of a dimension.
const FRACTION_VALUES = {
  '⅛': 0.125,
  '¼': 0.25,
  '⅜': 0.375,
  '½': 0.5,
  '⅝': 0.625,
  '¾': 0.75,
  '⅞': 0.875,
};

const INCH_TO_METER = 0.0254;

// "Label: value" patterns. The value runs up to the next comma, newline, or a
// period that is NOT followed by a digit, so decimal values such as 23.5 stay
// intact while a sentence-ending period still terminates the value.
// NOTE(review): the labels are not anchored, so a line such as "Seat width: ..."
// also matches the width pattern — confirm against the dataset whether that is
// intended.
const LABELED_DIM_PATTERNS = {
  width: /Width:\s*((?:[^,.\n]|\.(?=\d))+)/i,
  height: /Height:\s*((?:[^,.\n]|\.(?=\d))+)/i,
  depth: /Depth:\s*((?:[^,.\n]|\.(?=\d))+)/i,
};

// "WxDxH" or "W"xD"xH"" pattern; each number may carry a decimal part and/or a
// fraction glyph. The closing inch mark after the third number is required.
const TRIPLE_DIM_PATTERN =
  /(\d+(?:\.\d+)?[⅛¼⅜½⅝¾⅞]?)"?\s*x\s*(\d+(?:\.\d+)?[⅛¼⅜½⅝¾⅞]?)"?\s*x\s*(\d+(?:\.\d+)?[⅛¼⅜½⅝¾⅞]?)"/i;

// Box color per catalog category; unknown categories fall back to grey.
const MESH_COLORS = {
  seating: '#7a8a9a',
  tables: '#b09870',
  storage: '#f0ece4',
  beds: '#f5f0eb',
  kitchen: '#e0dcd4',
  office: '#cccccc',
};

// ASCII replacements applied before the slug is reduced to [a-z0-9-].
const TRANSLITERATIONS = { 'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'å': 'a' };

// Ordered name rules: the first pattern that matches the lower-cased product
// name decides the category. The office/desk-chair rule comes first so that it
// is not shadowed by the generic "chair" rule.
const NAME_CATEGORY_RULES = [
  [/(office|desk)\s+chair/, 'office'],
  [/sofa|couch|loveseat/, 'seating'],
  [/chair|stool/, 'seating'],
  [/desk|table/, 'tables'],
  [/shelf|bookcase|shelving|kallax|billy/, 'storage'],
  [/dresser|drawer|wardrobe|pax/, 'storage'],
  [/tv.*bench|tv.*stand|besta|bestå/, 'storage'],
  [/bed|mattress/, 'beds'],
  [/cabinet|kitchen|metod|knoxhult/, 'kitchen'],
  [/office/, 'office'],
];

// Row prefixes that open a section, paired with the section name used by the parser.
const SECTION_HEADERS = [
  ['# @CATEGORY', 'category'],
  ['# @PRODUCT', 'product'],
  ['# @SPECS', 'specs'],
  ['# @FILTERS', 'filters'],
  ['# @ITEMS', 'items'],
  ['# @IMAGES', 'images'],
];

/**
 * Round a length in meters to three decimals (millimeter precision).
 * @param {number} meters
 * @returns {number}
 */
function roundToMillimeter(meters) {
  return Math.round(meters * 1000) / 1000;
}

/**
 * Parse an inch dimension string like '23⅝"' or '50¾"' to meters.
 *
 * Quote characters (" and ') are stripped, each known fraction glyph adds its
 * value once, and the remaining text is read with parseFloat.
 *
 * @param {string} str - Raw dimension text.
 * @returns {number|null} Length in meters rounded to 3 decimals, or null when
 *   the input is empty or does not yield a positive number.
 */
function parseInchDim(str) {
  if (!str) return null;

  let text = str.trim().replace(/["']/g, '');
  let inches = 0;
  for (const [glyph, amount] of Object.entries(FRACTION_VALUES)) {
    if (text.includes(glyph)) {
      text = text.replace(glyph, '');
      inches += amount;
    }
  }

  const wholePart = Number.parseFloat(text);
  if (!Number.isNaN(wholePart)) inches += wholePart;

  return inches > 0 ? roundToMillimeter(inches * INCH_TO_METER) : null;
}

/**
 * Parse a value that may list alternatives ('23⅝" and 50¾"') and return the
 * largest one, regardless of the order in which they are written.
 *
 * @param {string} text - Captured value text of a "Label: value" line.
 * @returns {number|null} Largest parsed length in meters, or null if none parsed.
 */
function parseLargestInchDim(text) {
  const candidates = text
    .split(/\s+and\s+/)
    .map((part) => parseInchDim(part))
    .filter((meters) => meters !== null);
  return candidates.length > 0 ? Math.max(...candidates) : null;
}

/**
 * Extract width/depth/height from the lines of a @SPECS section.
 *
 * Examples of supported lines: 'Width: 23⅝" and 50¾".', 'Height: 29½"',
 * 'Depth: 15⅜"', and a combined '23"x15"x29"' (interpreted as W x D x H).
 * A later labeled line replaces an earlier value only when it parses; the
 * combined pattern only fills axes that are still missing.
 *
 * @param {string[]} specsLines - Non-empty lines collected from @SPECS.
 * @returns {{width: number, depth: number, height: number}|null} Dimensions in
 *   meters. When only width plus one other axis is known, the missing axis is
 *   estimated from the width (depth = 0.5 x width, height = 0.8 x width).
 *   Returns null when width is unknown or no second axis is available.
 */
function parseDimensions(specsLines) {
  const found = { width: null, height: null, depth: null };

  for (const line of specsLines) {
    for (const [axis, pattern] of Object.entries(LABELED_DIM_PATTERNS)) {
      const labeled = line.match(pattern);
      if (!labeled) continue;
      const meters = parseLargestInchDim(labeled[1]);
      // Do not let an unparseable value wipe out one that was already found.
      if (meters !== null) found[axis] = meters;
    }

    const triple = line.match(TRIPLE_DIM_PATTERN);
    if (triple) {
      found.width = found.width || parseInchDim(triple[1]);
      found.depth = found.depth || parseInchDim(triple[2]);
      found.height = found.height || parseInchDim(triple[3]);
    }
  }

  const { width, height, depth } = found;
  if (width && height && depth) {
    return { width, depth, height };
  }

  // At minimum need width and one other axis; estimate the missing one.
  if (width && (height || depth)) {
    return {
      width,
      depth: depth || roundToMillimeter(width * 0.5),
      height: height || roundToMillimeter(width * 0.8),
    };
  }

  return null;
}

/**
 * Generate a simple procedural box mesh from dimensions.
 *
 * @param {{width: number, depth: number, height: number}} dims - Size in meters.
 * @param {string} category - Catalog category, used to pick the box color.
 * @returns {object} A group with a single 'body' box whose base sits at y = 0.
 */
function generateMesh(dims, category) {
  const { width, depth, height } = dims;
  const color = MESH_COLORS[category] || '#aaaaaa';

  return {
    type: 'group',
    parts: [
      {
        name: 'body',
        geometry: 'box',
        size: [width, height, depth],
        position: [0, height / 2, 0],
        color,
      },
    ],
  };
}

/**
 * Generate a slug ID from a product name.
 *
 * The name is lower-cased, ä/ö/ü/å are transliterated, every other run of
 * non [a-z0-9] characters becomes a single '-', and the slug part is capped at
 * 50 characters without a leading or trailing '-'.
 *
 * @param {string} name - Product name.
 * @returns {string} Slug prefixed with 'ikea-hf-'.
 */
function slugify(name) {
  const slug = name
    .normalize('NFC') // make sure accented letters are single code points
    .toLowerCase()
    .replace(/[äöüå]/g, (ch) => TRANSLITERATIONS[ch])
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/(^-|-$)/g, '')
    .slice(0, 50)
    .replace(/-$/, ''); // truncation may end on a separator

  return `ikea-hf-${slug}`;
}

/**
 * Guess the catalog category from the product name, falling back to the
 * surrounding IKEA category and finally to 'storage'.
 *
 * @param {string} name - Product name.
 * @param {string|null} contextCategory - Name of the enclosing IKEA category, if any.
 * @returns {string} One of the catalog categories.
 */
function guessCategory(name, contextCategory) {
  const lowerName = name.toLowerCase();
  const rule = NAME_CATEGORY_RULES.find(([pattern]) => pattern.test(lowerName));
  if (rule) return rule[1];

  // Try context category. Prefer the longest matching key so that
  // "Office chairs" resolves via 'office chairs' instead of the shorter 'chairs'.
  if (contextCategory) {
    const lowerContext = contextCategory.toLowerCase();
    let bestKey = null;
    for (const key of Object.keys(CATEGORY_MAP)) {
      if (lowerContext.includes(key) && (bestKey === null || key.length > bestKey.length)) {
        bestKey = key;
      }
    }
    if (bestKey !== null) return CATEGORY_MAP[bestKey];
  }

  return 'storage'; // default
}

/**
 * Extract the IKEA series name from a product name.
 *
 * @param {string} name - Product name.
 * @returns {string|null} The leading run of two or more capital letters
 *   (A-Z, Å, Ä, Ö), or null when the name does not start with one.
 */
function extractSeries(name) {
  const match = name.match(/^([A-ZÅÄÖ]{2,})/);
  return match ? match[1] : null;
}

/**
 * Fetch one batch of dataset rows from the HuggingFace datasets server.
 *
 * @param {number} offset - Index of the first row to fetch.
 * @param {number} length - Number of rows to request.
 * @returns {Promise<string[]>} The `text` field of each row ('' when missing);
 *   an empty array when the response has no `rows`.
 * @throws {Error} When the HTTP response status is not OK.
 */
async function fetchRows(offset, length) {
  const url = `${API_BASE}/rows?dataset=${DATASET}&config=default&split=train&offset=${offset}&length=${length}`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error(`API error: ${resp.status}`);
  const data = await resp.json();
  return data.rows?.map((r) => r.row?.text || '') || [];
}

/**
 * Turn a reassembled product record into a catalog item.
 *
 * @param {{name: string, sku?: string}} product - Fields read from @PRODUCT.
 * @param {string[]} specsLines - Lines collected from the product's @SPECS section.
 * @param {string|null} contextCategory - Most recent @CATEGORY name, if any.
 * @returns {object|null} Catalog item, or null when no usable dimensions were
 *   found (width and height must both exceed 0.1 m).
 */
function buildCatalogItem(product, specsLines, contextCategory) {
  const dims = parseDimensions(specsLines);
  if (!dims || !(dims.width > 0.1 && dims.height > 0.1)) return null;

  const category = guessCategory(product.name, contextCategory);
  return {
    id: slugify(product.name),
    name: product.name,
    ikeaSeries: extractSeries(product.name),
    sku: product.sku || null,
    category,
    rooms: ROOM_MAP[category] || [],
    dimensions: dims,
    mesh: generateMesh(dims, category),
  };
}

/**
 * Stream the dataset in batches and collect catalog items.
 *
 * A product is emitted when its record is closed by '---', '# DISCLAIMER', the
 * start of the next '# @PRODUCT', or the end of the data. Items whose slug ID
 * was already seen are skipped. A fetch error stops the scan and returns what
 * was collected so far; it is logged to stderr, not thrown.
 *
 * @param {number} [maxItems=50] - Stop after this many items.
 * @param {(offset: number, length: number) => Promise<string[]>} [fetchBatch=fetchRows]
 *   Row source; an empty batch signals the end of the data.
 * @returns {Promise<object[]>} Catalog items in dataset order.
 */
async function importDataset(maxItems = 50, fetchBatch = fetchRows) {
  console.error(`Fetching IKEA products from HuggingFace (limit: ${maxItems})...`);

  const items = [];
  const seenIds = new Set();
  let currentProduct = null;
  let currentSection = null;
  let currentCategory = null;
  let specsLines = [];
  let totalRows = 0;
  let offset = 0;
  let exhausted = false;

  // Emit the product collected so far (if it is complete and new) and reset
  // the per-product state.
  const flushProduct = () => {
    const product = currentProduct;
    const specs = specsLines;
    currentProduct = null;
    specsLines = [];

    if (!product || !product.name || items.length >= maxItems) return;
    const item = buildCatalogItem(product, specs, currentCategory);
    if (item && !seenIds.has(item.id)) {
      seenIds.add(item.id);
      items.push(item);
    }
  };

  // Process in batches
  while (items.length < maxItems) {
    let rows;
    try {
      rows = await fetchBatch(offset, BATCH_SIZE);
    } catch (e) {
      console.error(`  Fetch error at offset ${offset}: ${e.message}`);
      break;
    }
    if (!rows || rows.length === 0) {
      exhausted = true;
      break;
    }
    totalRows += rows.length;

    for (const line of rows) {
      if (items.length >= maxItems) break;

      // Track sections
      const header = SECTION_HEADERS.find(([prefix]) => line.startsWith(prefix));
      if (header) {
        const section = header[1];
        if (section === 'product') {
          // A new product also closes the previous one when no separator was seen.
          flushProduct();
          currentProduct = {};
        }
        currentSection = section;
        continue;
      }

      const isDisclaimer = line.startsWith('# DISCLAIMER');
      if (line === '---' || isDisclaimer) {
        // End of product — process if we have one
        flushProduct();
        currentSection = isDisclaimer ? 'disclaimer' : null;
        continue;
      }

      // Parse line content based on section
      if (currentSection === 'category') {
        const nameMatch = line.match(/^Name:\s*(.+)/);
        if (nameMatch) currentCategory = nameMatch[1].trim();
      } else if (currentSection === 'product' && currentProduct) {
        const nameMatch = line.match(/^Name:\s*(.+)/);
        if (nameMatch) currentProduct.name = nameMatch[1].trim();
        const skuMatch = line.match(/^SKU:\s*(.+)/);
        if (skuMatch) currentProduct.sku = skuMatch[1].trim();
      } else if (currentSection === 'specs') {
        if (line.trim()) specsLines.push(line);
      }
    }

    if (items.length >= maxItems) break;
    offset += BATCH_SIZE;

    // Safety limit: don't scan more than 100k rows
    if (offset > MAX_SCAN_ROWS) {
      console.error(`  Reached scan limit at ${offset} rows`);
      break;
    }
  }

  // Only flush the trailing record when the data really ended; after a fetch
  // error or the scan limit the record may be incomplete.
  if (exhausted) flushProduct();

  console.error(`  Scanned ${totalRows} rows, extracted ${items.length} items with dimensions`);
  return items;
}

/**
 * Parse the command-line arguments of this script.
 *
 * Unknown arguments are ignored.
 *
 * @param {string[]} args - Arguments after the script path.
 * @returns {{limit: number, outputPath: string|null}} limit defaults to 100,
 *   outputPath to null (write to stdout).
 * @throws {Error} When --limit/--output has no value or --limit is not a
 *   non-negative integer.
 */
function parseArgs(args) {
  let limit = 100;
  let outputPath = null;

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    if (flag !== '--limit' && flag !== '--output') continue;

    const value = args[i + 1];
    if (value === undefined || value === '') {
      throw new Error(`Missing value for ${flag}`);
    }

    if (flag === '--limit') {
      const parsed = Number.parseInt(value, 10);
      if (!Number.isInteger(parsed) || parsed < 0) {
        throw new Error(`Invalid --limit value: ${value}`);
      }
      limit = parsed;
    } else {
      outputPath = value;
    }
    i++; // the value has been consumed
  }

  return { limit, outputPath };
}

/**
 * CLI entry point: import the dataset and write the catalog JSON to the
 * --output file, or to stdout when no output path is given. Progress messages
 * go to stderr so that stdout carries only the JSON.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { limit, outputPath } = parseArgs(process.argv.slice(2));

  const items = await importDataset(limit);

  const catalog = {
    version: '1.0',
    source: 'huggingface-ikea-us-commercetxt',
    units: 'meters',
    description: `Imported from HuggingFace dataset tsazan/ikea-us-commercetxt (${items.length} items)`,
    categories: [...new Set(items.map((i) => i.category))].sort(),
    items,
  };

  const json = JSON.stringify(catalog, null, 2);

  if (outputPath) {
    const fs = await import('fs');
    fs.writeFileSync(outputPath, json);
    console.error(`Wrote ${items.length} items to ${outputPath}`);
  } else {
    process.stdout.write(json);
  }
}

main().catch((e) => {
  console.error('Error:', e.message);
  process.exit(1);
});