Upload and process large Excel files in perspective without massive memory usage

Hi! I'm trying to create an app that processes some Excel files uploaded by the user. I have an UploadFile component with an event that sends the file bytes (file_data = event.file.getBytes()) to the following function:

from java.io import ByteArrayInputStream
from org.apache.poi.xssf.usermodel import XSSFWorkbook
from java.util import ArrayList
from java.lang import String

def read_excel_bytes(bytes):
	"""Parse the first sheet of an uploaded .xlsx file into an Ignition dataset.

	Args:
		bytes: raw .xlsx file content, e.g. event.file.getBytes() from an
			UploadFile component.

	Returns:
		An Ignition Dataset whose column names come from row 0 of sheet 0
		and whose data rows come from the remaining rows.
	"""
	stream = ByteArrayInputStream(bytes)
	workbook = XSSFWorkbook(stream)
	try:
		sheet = workbook.getSheetAt(0)  # first sheet

		headers = []
		headerRow = sheet.getRow(0)
		for i in range(headerRow.getLastCellNum()):
			cell = headerRow.getCell(i)
			headers.append(cell.getStringCellValue() if cell else "")

		data = []
		for rowIndex in range(1, sheet.getLastRowNum() + 1):
			# getRow() returns None for rows with no cells; treat as all-empty.
			row = sheet.getRow(rowIndex)
			fila = []
			for colIndex in range(len(headers)):
				cell = row.getCell(colIndex) if row is not None else None
				if cell is None:
					fila.append("")
				else:
					# getCellType() returns a CellType enum, not a string;
					# comparing the enum object to "STRING" is always False.
					# Compare its name() instead so each branch is reachable.
					tipo = cell.getCellType().name()
					if tipo == "STRING":
						fila.append(cell.getStringCellValue())
					elif tipo == "NUMERIC":
						fila.append(cell.getNumericCellValue())
					elif tipo == "BOOLEAN":
						fila.append(cell.getBooleanCellValue())
					else:
						fila.append(str(cell))  # FORMULA/BLANK/ERROR fallback
			data.append(fila)
	finally:
		# XSSFWorkbook holds native/zip resources; release them even on error.
		workbook.close()

	return system.dataset.toDataSet(headers, data)

However, when I upload files of up to 10k rows, the memory usage increases sharply. Does anyone know of an alternative, lightweight method for extracting Excel data in Perspective? Thanks!

You'll want to use the Streaming reader via OPCPackage.open().

Note that you have to accept the data as it comes in and pluck out the parts you are interested in. Potentially sorting afterward to deal with arbitrary ordering.

I haven't done this, but doesn't look like rocket science. It is clearly intended for reduced memory consumption.

It might help to use this API with a temporary file instead of opening the uploaded byte stream.

Thanks for your time. I've reached the following implementation, creating a temp file and reading it using OPCPackage and XSSFReader. However, the CPU usage remains quite high (although it's more stable than the previous approach), even though the Excel files I'm handling have at most 11k rows (about 1.4 MB in size at most).

import tempfile
from java.io import ByteArrayInputStream
from org.apache.poi.openxml4j.opc import OPCPackage
from org.apache.poi.xssf.eventusermodel import XSSFReader
from org.apache.poi.xssf.model import SharedStringsTable
from org.xml.sax import InputSource
from org.xml.sax.helpers import DefaultHandler
from javax.xml.parsers import SAXParserFactory

class Handler(DefaultHandler):
    """SAX handler that collects one worksheet's rows from SpreadsheetML.

    Row 0 is taken as the header; every later row becomes a dict keyed by
    the header values and appended to ``results``.

    Note: empty cells are simply omitted from the sheet XML, so values must
    be placed by the cell's real column (taken from its A1-style ``r``
    reference) or they drift out of alignment with the headers.
    """

    def __init__(self, sst):
        self.sst = sst              # SharedStringsTable from XSSFReader
        self.header = None          # list of column names (first row)
        self.current_row = []       # values of the row being parsed
        self.col_index = -1         # 0-based column of the cell being parsed
        self.text = ""              # accumulated <v> character data
        self.is_string = False      # current cell is a shared-string cell
        self.reading_value = False  # inside a <v> element
        self.row_count = 0
        self.results = []           # list of {header: value} dicts

    @staticmethod
    def _column_index(cell_ref):
        """Convert an A1-style cell reference (e.g. "C5") to a 0-based column."""
        idx = 0
        for ch in cell_ref:
            if not ch.isalpha():
                break
            idx = idx * 26 + (ord(ch.upper()) - ord('A') + 1)
        return idx - 1

    def startElement(self, uri, localName, qName, attrs):
        if qName == "row":
            self.current_row = []
            self.col_index = -1
        elif qName == "c":
            # t="s" marks a shared-string cell (value is an SST index).
            self.is_string = attrs.getValue("t") == "s"
            ref = attrs.getValue("r")
            if ref:
                # Track the true column so skipped empty cells are padded.
                self.col_index = self._column_index(ref)
            else:
                self.col_index += 1
        elif qName == "v":
            self.reading_value = True
            self.text = ""

    def characters(self, ch, start, length):
        # SAX may deliver one value in several chunks; accumulate them all.
        if self.reading_value:
            self.text += "".join(ch[start:start + length])

    def endElement(self, uri, localName, qName):
        if qName == "v":
            if self.is_string:
                # Shared-string cells store an index into the SST.
                valor = self.sst.getItemAt(int(self.text)).getString()
            else:
                valor = self.text
            # Pad any omitted (empty) columns so indexes match the headers.
            while len(self.current_row) < self.col_index:
                self.current_row.append("")
            self.current_row.append(valor)
            self.reading_value = False
        elif qName == "row":
            if self.row_count == 0:
                self.header = self.current_row
            else:
                fila_dict = {}
                for i, key in enumerate(self.header):
                    fila_dict[key] = self.current_row[i] if i < len(self.current_row) else None
                self.results.append(fila_dict)
            self.row_count += 1

def read_first_sheet_from_bytes(byte_array):
    """Stream-parse the first worksheet of an .xlsx file given as raw bytes.

    Writes the bytes to a temp file so OPCPackage can open it without
    loading the whole workbook into memory, then SAX-parses the first sheet
    with the streaming XSSFReader API.

    Args:
        byte_array: raw .xlsx content (e.g. event.file.getBytes()).

    Returns:
        list of dicts, one per data row, keyed by the header-row values.
    """
    results = []
    temp_file = tempfile.NamedTemporaryFile(delete=True)
    try:
        temp_file.write(byte_array)
        temp_file.flush()

        pkg = OPCPackage.open(temp_file.name)
        try:
            reader = XSSFReader(pkg)
            sst = reader.getSharedStringsTable()
            sheets = reader.getSheetsData()

            if sheets.hasNext():
                current_sheet = sheets.next()
                try:
                    parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader()
                    handler = Handler(sst)
                    parser.setContentHandler(handler)
                    parser.parse(InputSource(current_sheet))
                    results = handler.results
                finally:
                    # Close the sheet stream even if parsing throws.
                    current_sheet.close()
        finally:
            pkg.close()
    finally:
        # delete=True removes the temp file when it is closed.
        temp_file.close()

    return results

You are blazing a trail for the rest of us here. Thanks for reporting back.