- Create data/ikea-catalog.json with 41 curated IKEA items across 23 series (KALLAX, BILLY, MALM, PAX, HEMNES, LACK, etc.) with verified dimensions
- Add source tabs (All/Standard/IKEA) to the catalog panel for filtering
- Add an IKEA series filter bar when viewing IKEA items
- Add an IKEA badge and series label on item cards
- Add mergeCatalog() to the renderer for loading additional catalog files
- Add scripts/import-ikea-hf.js for importing from the HuggingFace dataset
362 lines
11 KiB
JavaScript
362 lines
11 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * IKEA HuggingFace Dataset Importer
 *
 * Fetches product data from the tsazan/ikea-us-commercetxt dataset on HuggingFace
 * and converts items with valid dimensions into our catalog JSON format.
 *
 * Usage:
 *   node scripts/import-ikea-hf.js [--limit N] [--output path]
 *
 * The HuggingFace dataset stores products in CommerceTXT format, where each row
 * is a single line of text. A product is spread across multiple rows, grouped
 * into sections such as @PRODUCT, @SPECS and @IMAGES. This script streams through
 * the rows, reassembles product records, extracts dimensions, and generates
 * procedural box meshes.
 */
|
|
|
|
/** HuggingFace dataset identifier interpolated into the `dataset=` query parameter. */
const DATASET = 'tsazan/ikea-us-commercetxt';

/** Base URL of the datasets-server API that `fetchRows` appends `/rows` to. */
const API_BASE = 'https://datasets-server.huggingface.co';

/** Number of rows requested per fetch and the step by which the scan offset advances. */
const BATCH_SIZE = 100;
|
|
|
|
// Category mapping from IKEA categories to our catalog categories
|
|
/**
 * Lower-case IKEA category phrase -> catalog category.
 *
 * `guessCategory` walks these entries and substring-matches each key against
 * the dataset's category name, so the insertion order is preserved here.
 */
const CATEGORY_MAP = {
  // Seating
  sofas: 'seating',
  armchairs: 'seating',
  chairs: 'seating',
  'dining chairs': 'seating',

  // Office
  'office chairs': 'office',
  'desk chairs': 'office',

  // Tables
  desks: 'tables',
  'dining tables': 'tables',
  'coffee tables': 'tables',
  'side tables': 'tables',
  'console tables': 'tables',
  nightstands: 'tables',
  'bedside tables': 'tables',

  // Storage
  bookcases: 'storage',
  'shelving units': 'storage',
  'shelf units': 'storage',
  dressers: 'storage',
  'chests of drawers': 'storage',
  wardrobes: 'storage',
  'tv stands': 'storage',
  'tv benches': 'storage',
  sideboards: 'storage',
  cabinets: 'storage',

  // Beds
  beds: 'beds',
  'bed frames': 'beds',

  // Kitchen
  'kitchen cabinets': 'kitchen',
  'kitchen islands': 'kitchen',
  'base cabinets': 'kitchen',
  'wall cabinets': 'kitchen',
};
|
|
|
|
// Room mapping based on category
|
|
/**
 * Catalog category -> list of room identifiers stored in each item's `rooms` field.
 * The identifiers are the German room keys used verbatim in the catalog output.
 */
const ROOM_MAP = {
  seating: ['wohnzimmer'],
  tables: ['wohnzimmer', 'esszimmer'],
  storage: ['wohnzimmer', 'arbeitszimmer'],
  beds: ['schlafzimmer'],
  kitchen: ['kueche'],
  office: ['arbeitszimmer'],
};
|
|
|
|
// Parse dimension string like '23⅝"' or '50¾"' to meters
|
|
/**
 * Convert an inch measurement such as `23⅝"`, `50¾"` or `23 5/8"` to meters.
 *
 * Quote marks are stripped, one Unicode vulgar-fraction glyph per kind and one
 * ASCII `a/b` fraction are added to the whole-number part, and the total is
 * converted with 1 in = 0.0254 m and rounded to the millimeter.
 *
 * @param {string} str - Measurement text.
 * @returns {number|null} Length in meters, or null when the input is not a
 *   string or no positive value could be read from it.
 */
function parseInchDim(str) {
  if (typeof str !== 'string' || str === '') return null;

  const vulgarFractions = {
    '⅛': 0.125,
    '¼': 0.25,
    '⅜': 0.375,
    '½': 0.5,
    '⅝': 0.625,
    '¾': 0.75,
    '⅞': 0.875,
  };

  let text = str.trim().replace(/["']/g, '');
  let inches = 0;

  for (const [glyph, amount] of Object.entries(vulgarFractions)) {
    if (text.includes(glyph)) {
      text = text.replace(glyph, '');
      inches += amount;
    }
  }

  // ASCII fractions ("23 5/8"): without this, parseFloat would stop at the
  // space or slash and silently drop (or misread) the fractional part.
  const asciiFraction = text.match(/(\d+)\s*\/\s*(\d+)/);
  if (asciiFraction) {
    const denominator = Number(asciiFraction[2]);
    if (denominator !== 0) inches += Number(asciiFraction[1]) / denominator;
    text = text.replace(asciiFraction[0], ' ');
  }

  const whole = Number.parseFloat(text);
  if (!Number.isNaN(whole)) inches += whole;

  // Inches -> meters, rounded to three decimals.
  return inches > 0 ? Math.round(inches * 0.0254 * 1000) / 1000 : null;
}
|
|
|
|
// Parse a dimensions line from @SPECS section
|
|
// Examples: "Width: 23⅝" and 50¾".", "Height: 29½"", "Depth: 15⅜""
|
|
/**
 * Extract width / depth / height (in meters) from the lines of a @SPECS section.
 *
 * Two notations are recognised on every line:
 *   - labelled values:  `Width: 23⅝" and 50¾".`, `Height: 29½"`, `Depth: 15⅜"`
 *   - a triple:         `W" x D" x H"` (only fills dimensions still missing)
 *
 * When a labelled value lists several measurements joined by "and", the
 * largest one is used. A line whose value cannot be parsed leaves a previously
 * found dimension untouched.
 *
 * @param {string[]} specsLines - Raw text lines of the specs section.
 * @returns {{width: number, depth: number, height: number}|null} The
 *   dimensions, or null unless a width plus at least one of height/depth was
 *   found. A missing depth is estimated as 0.5 x width and a missing height as
 *   0.8 x width.
 */
function parseDimensions(specsLines) {
  if (!Array.isArray(specsLines)) return null;

  // Largest parsable measurement in text like `23⅝" and 50¾"`, or null.
  const largestOf = (text) => {
    const values = text
      .split(/\s+and\s+/)
      .map((part) => parseInchDim(part))
      .filter((value) => value !== null);
    return values.length > 0 ? Math.max(...values) : null;
  };

  // Value following `Label:` on the line (up to the first comma, period or newline).
  const labelled = (line, pattern) => {
    const match = line.match(pattern);
    return match ? largestOf(match[1]) : null;
  };

  let width = null;
  let height = null;
  let depth = null;

  for (const line of specsLines) {
    width = labelled(line, /Width:\s*([^,.\n]+)/i) ?? width;
    height = labelled(line, /Height:\s*([^,.\n]+)/i) ?? height;
    depth = labelled(line, /Depth:\s*([^,.\n]+)/i) ?? depth;

    // "WxDxH" triple; the closing inch mark after the third number is required.
    const triple = line.match(/(\d+[⅛¼⅜½⅝¾⅞]?)"?\s*x\s*(\d+[⅛¼⅜½⅝¾⅞]?)"?\s*x\s*(\d+[⅛¼⅜½⅝¾⅞]?)"/i);
    if (triple) {
      width = width || parseInchDim(triple[1]);
      depth = depth || parseInchDim(triple[2]);
      height = height || parseInchDim(triple[3]);
    }
  }

  if (width && height && depth) {
    return { width, depth, height };
  }

  // At minimum a width and one other dimension are needed; estimate the third.
  if (width && (height || depth)) {
    return {
      width,
      depth: depth || Math.round(width * 0.5 * 1000) / 1000,
      height: height || Math.round(width * 0.8 * 1000) / 1000,
    };
  }

  return null;
}
|
|
|
|
// Generate a simple procedural box mesh from dimensions
|
|
/**
 * Build the procedural mesh description for an item: a group holding a single
 * box named "body".
 *
 * The box is sized `[width, height, depth]` and positioned at half its height
 * on the second axis. Its color is looked up by category, with `#aaaaaa` used
 * for categories that have no entry.
 *
 * @param {{width: number, depth: number, height: number}} dims - Item dimensions.
 * @param {string} category - Catalog category used to pick the color.
 * @returns {{type: string, parts: object[]}} Mesh description.
 */
function generateMesh(dims, category) {
  const palette = {
    seating: '#7a8a9a',
    tables: '#b09870',
    storage: '#f0ece4',
    beds: '#f5f0eb',
    kitchen: '#e0dcd4',
    office: '#cccccc',
  };
  const tint = palette[category] || '#aaaaaa';

  const body = {
    name: 'body',
    geometry: 'box',
    size: [dims.width, dims.height, dims.depth],
    position: [0, dims.height / 2, 0],
    color: tint,
  };

  return { type: 'group', parts: [body] };
}
|
|
|
|
// Generate slug ID from product name
|
|
/**
 * Derive a catalog id (`ikea-hf-<slug>`) from a product name.
 *
 * The name is lower-cased, `ä/ö/ü` become `ae/oe/ue`, `å` becomes `a`, any
 * remaining accents are stripped, and every run of other characters collapses
 * to a single hyphen. The slug part is limited to 50 characters and never
 * starts or ends with a hyphen, even after truncation.
 *
 * @param {string} name - Product name.
 * @returns {string} Id of the form `ikea-hf-<slug>`.
 */
function slugify(name) {
  const transliterations = { 'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'å': 'a' };

  const slug = name
    .normalize('NFC') // so pre-decomposed umlauts still hit the table below
    .toLowerCase()
    .replace(/[äöüå]/g, (letter) => transliterations[letter])
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '') // drop combining accents (é -> e)
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/(^-|-$)/g, '')
    .slice(0, 50)
    .replace(/-$/, ''); // the cut may land right after a separator

  return `ikea-hf-${slug}`;
}
|
|
|
|
// Guess category from product name/context
|
|
/**
 * Guess the catalog category of a product.
 *
 * Keyword rules are tried against the lower-cased product name in order; the
 * first hit wins. Office and desk chairs are checked before the generic chair
 * rule so that they land in `office`, in line with CATEGORY_MAP. If no rule
 * matches, the dataset category name is substring-matched against the
 * CATEGORY_MAP keys, longest key first, so that "office chairs" is not
 * swallowed by "chairs" nor "kitchen cabinets" by "cabinets".
 *
 * @param {string} name - Product name.
 * @param {string|null|undefined} contextCategory - Category name from the
 *   surrounding dataset section, if any.
 * @returns {string} Catalog category; `'storage'` when nothing matches.
 */
function guessCategory(name, contextCategory) {
  const lower = name.toLowerCase();

  const nameRules = [
    [/(office|desk)\s+chair/, 'office'],
    [/sofa|couch|loveseat/, 'seating'],
    [/chair|armchair|stool/, 'seating'],
    [/desk|table/, 'tables'],
    [/shelf|bookcase|shelving|kallax|billy/, 'storage'],
    [/dresser|drawer|wardrobe|pax|malm.*drawer/, 'storage'],
    [/tv.*bench|tv.*stand|besta|bestå/, 'storage'],
    [/bed|mattress/, 'beds'],
    [/cabinet|kitchen|metod|knoxhult/, 'kitchen'],
    [/office/, 'office'],
  ];

  for (const [pattern, category] of nameRules) {
    if (pattern.test(lower)) return category;
  }

  // Fall back to the dataset category, preferring the most specific key.
  if (contextCategory) {
    const context = contextCategory.toLowerCase();
    const bySpecificity = Object.entries(CATEGORY_MAP).sort(
      ([keyA], [keyB]) => keyB.length - keyA.length,
    );
    for (const [key, category] of bySpecificity) {
      if (context.includes(key)) return category;
    }
  }

  return 'storage'; // default
}
|
|
|
|
// Extract IKEA series name from product name
|
|
/**
 * Pull the series name off the front of a product name.
 *
 * The series is taken to be the run of two or more capital letters (A-Z, Å, Ä,
 * Ö) that the name starts with.
 *
 * @param {string} name - Product name.
 * @returns {string|null} The leading capitals, or null when the name does not
 *   start with at least two of them.
 */
function extractSeries(name) {
  const leadingCaps = name.match(/^([A-ZÅÄÖ]{2,})/);
  if (!leadingCaps) {
    return null;
  }
  const [, series] = leadingCaps;
  return series;
}
|
|
|
|
/**
 * Fetch one page of dataset rows and return their text.
 *
 * Requests `${API_BASE}/rows` for the `default` config and `train` split of
 * DATASET. Each returned entry contributes its `row.text`, or an empty string
 * when that is missing or falsy.
 *
 * @param {number} offset - Index of the first row to request.
 * @param {number} length - Number of rows to request.
 * @returns {Promise<string[]>} Row texts; empty when the response has no `rows`.
 * @throws {Error} `API error: <status>` when the response is not OK.
 */
async function fetchRows(offset, length) {
  const response = await fetch(
    `${API_BASE}/rows?dataset=${DATASET}&config=default&split=train&offset=${offset}&length=${length}`,
  );

  if (!response.ok) {
    throw new Error(`API error: ${response.status}`);
  }

  const payload = await response.json();
  if (payload.rows == null) {
    return [];
  }
  return payload.rows.map(({ row }) => row?.text || '');
}
|
|
|
|
/**
 * Stream the dataset and collect catalog items that have usable dimensions.
 *
 * Rows are fetched in batches of BATCH_SIZE and fed through a small line
 * parser. A `# @…` header switches the current section, `Name:` / `SKU:`
 * lines are read inside @CATEGORY and @PRODUCT, and non-blank @SPECS lines are
 * collected. A `---` line or a `# DISCLAIMER` header closes the current
 * product. A product is emitted when it has a name, its specs yield a width
 * and height above 0.1 m, and its id has not been seen before.
 *
 * Scanning stops when `maxItems` items were collected, the dataset is
 * exhausted, a fetch fails (logged; the items gathered so far are returned),
 * or the offset passes 100,000 rows. When the dataset is exhausted, a product
 * that is still open is emitted too, so the last record is not lost if the
 * data does not end with a terminator line.
 *
 * Progress is reported on stderr so stdout stays free for the JSON output.
 *
 * @param {number} [maxItems=50] - Maximum number of items to return.
 * @returns {Promise<object[]>} Catalog items in the order they were found.
 */
async function importDataset(maxItems = 50) {
  console.error(`Fetching IKEA products from HuggingFace (limit: ${maxItems})...`);

  // Header prefix -> section name. Only @PRODUCT also resets the record state.
  const sectionHeaders = [
    ['# @CATEGORY', 'category'],
    ['# @PRODUCT', 'product'],
    ['# @SPECS', 'specs'],
    ['# @FILTERS', 'filters'],
    ['# @ITEMS', 'items'],
    ['# @IMAGES', 'images'],
  ];
  const scanLimit = 100000;

  const items = [];
  const seenIds = new Set();
  let offset = 0;
  let totalRows = 0;
  let currentProduct = null;
  let currentSection = null;
  let currentCategory = null;
  let specsLines = [];

  // Emit the open product (if it qualifies) and reset the per-product state.
  const flushProduct = () => {
    if (currentProduct && currentProduct.name) {
      const dims = parseDimensions(specsLines);
      if (dims && dims.width > 0.1 && dims.height > 0.1) {
        const category = guessCategory(currentProduct.name, currentCategory);
        const id = slugify(currentProduct.name);
        if (!seenIds.has(id)) {
          seenIds.add(id);
          items.push({
            id,
            name: currentProduct.name,
            ikeaSeries: extractSeries(currentProduct.name),
            sku: currentProduct.sku || null,
            category,
            rooms: ROOM_MAP[category] || [],
            dimensions: dims,
            mesh: generateMesh(dims, category),
          });
        }
      }
    }
    currentProduct = null;
    specsLines = [];
  };

  while (items.length < maxItems) {
    let rows;
    try {
      rows = await fetchRows(offset, BATCH_SIZE);
    } catch (e) {
      console.error(` Fetch error at offset ${offset}: ${e.message}`);
      break;
    }

    if (!rows || rows.length === 0) {
      // End of data: the last record may lack a terminator line.
      flushProduct();
      break;
    }
    totalRows += rows.length;

    for (const line of rows) {
      const header = sectionHeaders.find(([prefix]) => line.startsWith(prefix));
      if (header) {
        currentSection = header[1];
        if (currentSection === 'product') {
          currentProduct = {};
          specsLines = [];
        }
        continue;
      }

      const isDisclaimer = line.startsWith('# DISCLAIMER');
      if (line === '---' || isDisclaimer) {
        flushProduct();
        currentSection = isDisclaimer ? 'disclaimer' : null;
        if (items.length >= maxItems) break;
        continue;
      }

      if (currentSection === 'category') {
        const nameMatch = line.match(/^Name:\s*(.+)/);
        if (nameMatch) currentCategory = nameMatch[1].trim();
      } else if (currentSection === 'product' && currentProduct) {
        const nameMatch = line.match(/^Name:\s*(.+)/);
        if (nameMatch) currentProduct.name = nameMatch[1].trim();
        const skuMatch = line.match(/^SKU:\s*(.+)/);
        if (skuMatch) currentProduct.sku = skuMatch[1].trim();
      } else if (currentSection === 'specs') {
        if (line.trim()) specsLines.push(line);
      }
    }

    if (items.length >= maxItems) break;
    offset += BATCH_SIZE;

    // Safety limit: don't scan more than 100k rows.
    if (offset > scanLimit) {
      console.error(` Reached scan limit at ${offset} rows`);
      break;
    }
  }

  console.error(` Scanned ${totalRows} rows, extracted ${items.length} items with dimensions`);
  return items;
}
|
|
|
|
/**
 * CLI entry point: parse the arguments, run the import, and emit the catalog.
 *
 * Options:
 *   --limit N      maximum number of items to import (default 100)
 *   --output path  write the JSON to this file instead of stdout
 *
 * The catalog JSON goes to stdout unless --output is given; the status line
 * goes to stderr.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the --limit value is not a non-negative integer.
 *   Previously such a value became NaN and silently produced an empty catalog.
 */
async function main() {
  const args = process.argv.slice(2);
  let limit = 100;
  let outputPath = null;

  for (let i = 0; i < args.length; i++) {
    const value = args[i + 1];
    if (args[i] === '--limit' && value) {
      if (!/^\d+$/.test(value)) {
        throw new Error(`Invalid --limit value "${value}": expected a non-negative integer`);
      }
      limit = Number.parseInt(value, 10);
    }
    if (args[i] === '--output' && value) outputPath = value;
  }

  const items = await importDataset(limit);

  const catalog = {
    version: '1.0',
    source: 'huggingface-ikea-us-commercetxt',
    units: 'meters',
    description: `Imported from HuggingFace dataset tsazan/ikea-us-commercetxt (${items.length} items)`,
    categories: [...new Set(items.map((item) => item.category))].sort(),
    items,
  };

  const json = JSON.stringify(catalog, null, 2);

  if (outputPath) {
    // Loaded lazily: only needed when writing to a file.
    const { writeFileSync } = await import('node:fs');
    writeFileSync(outputPath, json);
    console.error(`Wrote ${items.length} items to ${outputPath}`);
  } else {
    process.stdout.write(json);
  }
}
|
|
|
|
// Run the CLI. Any failure is reported on stderr and ends the process with exit code 1.
function reportFatal(error) {
  console.error('Error:', error.message);
  process.exit(1);
}

main().catch(reportFatal);
|