Spaces:
Running
Running
| import express from 'express'; | |
| import { chromium } from 'playwright'; | |
| import cors from 'cors'; | |
| import bodyParser from 'body-parser'; | |
const app = express();
const PORT = process.env.PORT || 7860;

// Application middleware.
app.set('json spaces', 2);
// CORS is registered first so that error responses produced by the body
// parsers below still carry the CORS headers.
app.use(cors());
app.use(bodyParser.urlencoded({ extended: true }));
// Exactly one JSON parser. Previously `bodyParser.json()` (default limit) ran
// before `express.json({ limit: '500mb' })`; the first parser consumed the body,
// so the second one - and its 500mb limit - never took effect. The endpoint
// only accepts a short list of URLs, so a small explicit limit is sufficient.
app.use(express.json({ limit: '1mb' }));

// Configuration constants.
// Timeout in milliseconds applied to page navigation and page operations.
const DEFAULT_TIMEOUT = 30000;
// Upper bound on pages being processed at the same time.
const MAX_CONCURRENT_PAGES = 3;
// CSS selectors tried, in order, when looking for a page's main content container.
const CONTENT_SELECTORS = [
  'main', 'article', '[role="main"]', '.content', '#content',
  '.post-content', '.entry-content', '.article-content', '.page-content',
  '.main-content', '[itemprop="articleBody"]', '.story-body', '.article-body',
  '.detail__body-text', '.detail__body', '.itp_bodycontent'
];
// Resource-blocking configuration.
// Playwright resource types that are never needed for text extraction.
const BLOCKED_RESOURCES = ['font', 'media', 'websocket', 'image'];
// Case-insensitive substrings; any sub-resource URL containing one is aborted.
const BLOCKED_PATTERNS = [
  'google-analytics', 'doubleclick', 'facebook', 'twitter',
  'analytics', 'ads', 'tracking', 'pixel'
];

/**
 * Installs a request router on the page that aborts requests which are not
 * needed for text extraction (fonts, media, images, websockets, and URLs that
 * match a tracker/ad substring) and lets everything else through.
 *
 * The main-frame navigation request is never blocked: the patterns are plain
 * substrings, so a target URL such as `/downloads/...` (contains "ads") or a
 * page hosted on a blocked domain would otherwise abort its own navigation.
 *
 * @param {import('playwright').Page} page - Playwright page to install the router on
 * @returns {Promise<void>} Resolves once the route handler is registered
 */
async function setupOptimizedRouting(page) {
  await page.route('**/*', async (route) => {
    const request = route.request();

    // `frame()` is only consulted for navigation requests (short-circuit).
    const isMainDocument =
      request.isNavigationRequest() && request.frame() === page.mainFrame();

    const requestUrl = request.url().toLowerCase();
    const shouldBlock = !isMainDocument && (
      BLOCKED_RESOURCES.includes(request.resourceType()) ||
      BLOCKED_PATTERNS.some((pattern) => requestUrl.includes(pattern))
    );

    try {
      if (shouldBlock) {
        await route.abort();
      } else {
        await route.continue();
      }
    } catch {
      // The route can no longer be resolved (for example the page or context
      // was closed while the request was in flight). Nothing left to do, and
      // letting this escape would surface as an unhandled rejection.
    }
  });
}
/**
 * Scrolls the page downwards in fixed steps so that lazily loaded content gets
 * a chance to load, then jumps back to the top.
 *
 * Scrolling stops as soon as the scrolled distance reaches the current
 * `document.body.scrollHeight` or the step limit is hit, whichever comes first.
 *
 * @param {import('playwright').Page} page - Playwright page to scroll
 * @returns {Promise<void>} Resolves after the page has been scrolled back to the top
 */
async function optimizedScroll(page) {
  // Runs inside the browser, so every value it needs is declared locally.
  const scrollInBrowser = () =>
    new Promise((done) => {
      const STEP_PX = 300;
      const MAX_STEPS = 20;
      const INTERVAL_MS = 50;
      const scrollCap = STEP_PX * MAX_STEPS;
      let scrolled = 0;

      const ticker = setInterval(() => {
        // Read the height before scrolling; it may grow as content loads.
        const pageHeight = document.body.scrollHeight;
        window.scrollBy(0, STEP_PX);
        scrolled += STEP_PX;

        const reachedBottom = scrolled >= pageHeight;
        const reachedCap = scrolled >= scrollCap;
        if (!reachedBottom && !reachedCap) {
          return;
        }

        clearInterval(ticker);
        window.scrollTo(0, 0);
        done();
      }, INTERVAL_MS);
    });

  await page.evaluate(scrollInBrowser);
}
/**
 * Browser-side extractor: collects title, meta description, headings,
 * paragraphs and the main text of the current document.
 *
 * This function is passed to `page.evaluate`, so it executes inside the page
 * and must not reference anything from the surrounding module scope.
 *
 * @param {string[]} selectors - CSS selectors to try, in order, for the main content container
 * @returns {Object} Extracted content (title, metaDescription, headings, paragraphs, mainText, wordCount, hasContent, url)
 */
function collectPageContent(selectors) {
  const cleanText = (text) => (text ? text.replace(/\s+/g, ' ').trim() : '');

  // Text of an element with non-content descendants stripped from a clone,
  // so the live DOM is left untouched.
  const getTextContent = (element) => {
    if (!element) return '';
    const clone = element.cloneNode(true);
    clone
      .querySelectorAll('script, style, noscript, iframe, nav, header, footer, aside')
      .forEach((el) => el.remove());
    return cleanText(clone.textContent || clone.innerText || '');
  };

  const headingTexts = (tag, limit) =>
    Array.from(document.querySelectorAll(tag))
      .map((heading) => getTextContent(heading))
      .filter(Boolean)
      .slice(0, limit);

  const title = document.title || '';
  const metaDescription =
    document.querySelector('meta[name="description"]')?.content ||
    document.querySelector('meta[property="og:description"]')?.content ||
    '';

  const headings = {
    h1: headingTexts('h1', 5),
    h2: headingTexts('h2', 10)
  };

  const paragraphs = Array.from(document.querySelectorAll('p'))
    .map((p) => getTextContent(p))
    .filter((text) => text.length > 20)
    .slice(0, 50);

  // First known container that holds a meaningful amount of text.
  let mainContent = null;
  for (const selector of selectors) {
    const element = document.querySelector(selector);
    if (element && getTextContent(element).length > 100) {
      mainContent = element;
      break;
    }
  }

  // Fallback: the div/section/article with the most text, else <body>.
  if (!mainContent) {
    const contentCandidates = Array.from(document.querySelectorAll('div, section, article'))
      .map((el) => ({ element: el, text: getTextContent(el) }))
      .filter((candidate) => candidate.text.length > 200)
      .sort((a, b) => b.text.length - a.text.length);
    mainContent = contentCandidates[0]?.element || document.body;
  }

  const mainText = getTextContent(mainContent);
  // `document.body` can be null (e.g. non-HTML documents); do not throw on it.
  const allText =
    mainText || paragraphs.slice(0, 20).join(' ') || document.body?.innerText || '';
  const wordCount = allText.split(/\s+/).filter((word) => word.length > 0).length;

  return {
    title: title.trim(),
    metaDescription: metaDescription.trim(),
    headings,
    paragraphs: paragraphs.slice(0, 20),
    mainText: mainText.slice(0, 10000),
    wordCount,
    hasContent: wordCount > 50,
    url: window.location.href
  };
}

/**
 * Navigates the given page to `url`, lets dynamic content settle, and extracts
 * the page's main content.
 *
 * Never throws: any failure is reported as `{ success: false, error }`.
 *
 * @param {string} url - URL to extract content from
 * @param {import('playwright').Page} page - Playwright page used for the extraction
 * @returns {Promise<Object>} `{ url, success, content | error, extractedAt }`
 */
async function extractContentFromUrl(url, page) {
  try {
    page.setDefaultNavigationTimeout(DEFAULT_TIMEOUT);
    page.setDefaultTimeout(DEFAULT_TIMEOUT);
    await setupOptimizedRouting(page);

    // Navigate to the target; on failure retry once waiting for the 'load' event.
    try {
      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: DEFAULT_TIMEOUT
      });
    } catch (navigationError) {
      await page.goto(url, {
        waitUntil: 'load',
        timeout: DEFAULT_TIMEOUT
      });
    }

    // Best-effort wait for a visible <body>; extraction below copes without it.
    try {
      await page.waitForSelector('body', { state: 'visible', timeout: 5000 });
    } catch {
      // Intentionally ignored: a missing/hidden body is handled during extraction.
    }

    // Give client-side rendering a moment to run.
    await page.waitForTimeout(1500);

    // Wait (at most 2s) for ANY known content container, not just the first
    // selector in the list. A timeout here is expected for pages that use none
    // of them; extraction then falls back to its own heuristics.
    try {
      await page.waitForSelector(CONTENT_SELECTORS.join(', '), { timeout: 2000 });
    } catch {
      // Intentionally ignored: no known container appeared within the cap.
    }

    // Scroll to trigger lazy-loaded content, then let it settle briefly.
    await optimizedScroll(page);
    await page.waitForTimeout(500);

    const content = await page.evaluate(collectPageContent, CONTENT_SELECTORS);

    return {
      url,
      success: true,
      content,
      extractedAt: new Date().toISOString()
    };
  } catch (error) {
    return {
      url,
      success: false,
      error: error.message,
      extractedAt: new Date().toISOString()
    };
  }
}
| // Route API | |
/**
 * Splits the input into URLs that may be fetched and values that may not.
 *
 * A value is valid only when it is a string that parses as an absolute URL
 * with the `http:` or `https:` scheme. Everything else - unparseable strings,
 * other schemes such as `file:` or `javascript:`, and non-string values - is
 * reported as invalid, so the browser is never pointed at local files or
 * script URLs. Hosts are not inspected (private/internal addresses are not
 * filtered here).
 *
 * @param {Array<*>} urls - Candidate URLs to validate
 * @returns {{ validUrls: string[], invalidUrls: Array<*> }} Inputs partitioned by validity, in input order
 */
function validateUrls(urls) {
  const allowedProtocols = new Set(['http:', 'https:']);
  const validUrls = [];
  const invalidUrls = [];

  for (const url of urls) {
    let isValid = false;
    // Non-strings are rejected up front: `new URL()` would stringify them
    // (an array like ['https://x'] would otherwise pass).
    if (typeof url === 'string') {
      try {
        isValid = allowedProtocols.has(new URL(url).protocol);
      } catch {
        // Not parseable as an absolute URL.
        isValid = false;
      }
    }
    (isValid ? validUrls : invalidUrls).push(url);
  }

  return { validUrls, invalidUrls };
}
/**
 * Extracts one URL inside its own browser context and always closes that
 * context afterwards. Never throws; failures are returned as a result object.
 *
 * @param {string} url - URL to extract
 * @param {import('playwright').Browser} browser - Playwright browser instance
 * @param {Object} contextOptions - Options passed to `browser.newContext`
 * @returns {Promise<Object>} Extraction result for `url`
 */
async function extractInIsolatedContext(url, browser, contextOptions) {
  let context;
  try {
    context = await browser.newContext(contextOptions);
    const page = await context.newPage();
    return await extractContentFromUrl(url, page);
  } catch (error) {
    return {
      url,
      success: false,
      error: error?.message || 'Ekstraksi gagal',
      extractedAt: new Date().toISOString()
    };
  } finally {
    if (context) {
      try {
        await context.close();
      } catch {
        // A failed close must not replace the extraction result.
      }
    }
  }
}

/**
 * Processes the given URLs with at most `MAX_CONCURRENT_PAGES` extractions
 * running at the same time, each in its own browser context.
 *
 * Returns exactly one result per input URL, in input order. (Results were
 * previously stored by concurrency-slot index, so later URLs overwrote
 * earlier ones and no more than `MAX_CONCURRENT_PAGES` results came back.)
 *
 * @param {string[]} urls - URLs to process
 * @param {import('playwright').Browser} browser - Playwright browser instance
 * @returns {Promise<Object[]>} Extraction results, one per URL, in input order
 */
async function processUrlsConcurrently(urls, browser) {
  const contextOptions = {
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    viewport: { width: 1920, height: 1080 },
    locale: 'en-US',
    timezoneId: 'America/New_York',
    ignoreHTTPSErrors: true
  };

  const results = new Array(urls.length);
  let nextIndex = 0;

  // Each worker repeatedly claims the next unprocessed index. Claiming is a
  // synchronous read-and-increment, so two workers never take the same URL.
  const runWorker = async () => {
    while (nextIndex < urls.length) {
      const index = nextIndex;
      nextIndex += 1;
      results[index] = await extractInIsolatedContext(urls[index], browser, contextOptions);
    }
  };

  const workerCount = Math.min(MAX_CONCURRENT_PAGES, urls.length);
  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));

  return results;
}
/**
 * POST /extract-content
 *
 * Expects a JSON body `{ urls: string[] }` with 1 to 10 URLs. Responds with
 * 400 when the body is missing/malformed or any URL fails validation,
 * otherwise launches a browser, extracts every URL and responds with
 * per-URL results plus summary statistics. The browser is always closed.
 */
app.post('/extract-content', async (req, res) => {
  const MAX_URLS_PER_REQUEST = 10;

  // `req.body` is undefined when no body parser handled the request.
  const { urls } = req.body ?? {};

  // Input validation.
  if (!Array.isArray(urls) || urls.length === 0) {
    return res.status(400).json({
      success: false,
      message: 'Body harus berisi array urls yang tidak kosong.'
    });
  }
  if (urls.length > MAX_URLS_PER_REQUEST) {
    return res.status(400).json({
      success: false,
      message: `Maksimal ${MAX_URLS_PER_REQUEST} URLs per request.`
    });
  }
  const { validUrls, invalidUrls } = validateUrls(urls);
  if (invalidUrls.length > 0) {
    return res.status(400).json({
      success: false,
      message: 'Format URL tidak valid.',
      invalidUrls
    });
  }

  let browser;
  try {
    // NOTE(review): '--disable-web-security' and '--no-sandbox' weaken browser
    // isolation while loading arbitrary third-party pages - confirm they are required.
    browser = await chromium.launch({
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--disable-gpu',
        '--disable-blink-features=AutomationControlled',
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding'
      ],
      executablePath: process.env.CHROME_BIN,
      headless: true,
    });

    // Process the URLs with bounded concurrency.
    const results = await processUrlsConcurrently(validUrls, browser);

    const successCount = results.filter((r) => r.success).length;
    const failCount = results.filter((r) => !r.success).length;
    // `hasContent` is already defined as wordCount > 50, so it alone decides emptiness.
    const emptyContentCount = results.filter((r) => r.success && !r.content.hasContent).length;

    res.json({
      success: true,
      message: `Berhasil memproses ${validUrls.length} URLs.`,
      statistics: {
        total: validUrls.length,
        success: successCount,
        failed: failCount,
        emptyContent: emptyContentCount
      },
      results
    });
  } catch (error) {
    if (!res.headersSent) {
      res.status(500).json({
        success: false,
        message: 'Terjadi kesalahan saat memproses URLs.',
        error: error.message
      });
    }
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // The response has already been sent; a rejection escaping this async
        // handler would only surface as an unhandled rejection.
        console.error('Gagal menutup browser:', closeError);
      }
    }
  }
});
/**
 * GET /
 *
 * Returns a short description of the API: its name, the host it was reached
 * on, and the list of available endpoints.
 */
app.get('/', (req, res) => {
  const endpoints = {
    'POST /extract-content': 'Ekstrak konten dari URL',
    'GET /': 'Informasi API',
  };
  const hostname = `https://${req.hostname}`;

  res.json({
    success: true,
    message: 'API Ekstrak Konten',
    hostname,
    endpoints,
  });
});
// Global error-handling middleware.
/**
 * Converts any error passed down the middleware chain into a JSON response.
 *
 * Errors carrying a 4xx `status`/`statusCode` (for example a malformed or
 * oversized request body rejected by the body parser) keep that status
 * instead of being reported as a 500. Everything else is answered with 500.
 * If the response has already started, the error is delegated to Express'
 * default handler.
 */
app.use((err, req, res, next) => {
  if (res.headersSent) {
    return next(err);
  }

  const reportedStatus = err?.status ?? err?.statusCode;
  const isClientError =
    Number.isInteger(reportedStatus) && reportedStatus >= 400 && reportedStatus < 500;

  res.status(isClientError ? reportedStatus : 500).json({
    success: false,
    message: isClientError
      ? 'Permintaan tidak valid.'
      : 'Terjadi kesalahan internal server.',
    error: err?.message
  });
});
// Fallback for requests that matched no route (404).
/**
 * Answers any request that reached this point with a 404 status and a JSON
 * body shaped like the other error responses.
 */
app.use((req, res) => {
  const notFoundPayload = {
    success: false,
    message: 'Endpoint tidak ditemukan.',
  };
  res.status(404).json(notFoundPayload);
});
/**
 * Starts the Express server on the configured port and logs a message once
 * it is listening.
 */
app.listen(PORT, () => {
  const startupMessage = `π API berjalan pada port ${PORT}`;
  console.log(startupMessage);
});