HoruxDespachosNuevo/apps/api/src/services/sat/sat-csf-parser.ts

import { PDFParse } from 'pdf-parse';

/**
 * Address fields of the CSF, one property per labelled entry in the PDF text.
 *
 * Every property is optional: `parseCsfPdf` fills each one with whatever
 * `extractLabels` captured for the matching label, which is `undefined` when
 * the label is absent and may be an empty string when the label has no value.
 */
export interface Domicilio {
  /** Value of the "Código Postal" label. */
  codigoPostal?: string;
  /** Value of the "Tipo de Vialidad" label. */
  tipoVialidad?: string;
  /** Value of the "Nombre de Vialidad" label. */
  nombreVialidad?: string;
  /** Value of the "Número Exterior" label. */
  numeroExterior?: string;
  /** Value of the "Número Interior" label. */
  numeroInterior?: string;
  /** Value of the "Nombre de la Colonia" label. */
  colonia?: string;
  /** Value of the "Nombre de la Localidad" label. */
  localidad?: string;
  /** Value of the "Nombre del Municipio o Demarcación Territorial" label. */
  municipio?: string;
  /** Value of the "Nombre de la Entidad Federativa" label. */
  entidadFederativa?: string;
  /** Value of the "Entre Calle" label. */
  entreCalle?: string;
  /** Value of the "Y Calle" label. */
  yCalle?: string;
}

/**
 * One row of the "Actividades Económicas" table, as parsed by
 * `extractActividades`.
 */
export interface ActividadEconomica {
  /** Leading integer of the row. */
  orden: number;
  /** Free-text activity description, with runs of whitespace collapsed. */
  descripcion: string;
  /** Integer that precedes the start date in the row. */
  porcentaje: number;
  /** Start date kept as printed, in NN/NN/NNNN form (presumably DD/MM/YYYY — confirm). */
  fechaInicio: string;
  /** End date in the same form; `undefined` when the row has only one date. */
  fechaFin?: string;
}

/** One row of the "Regímenes" table, as parsed by `extractRegimenes`. */
export interface RegimenCsf {
  /** Regime name: everything in the row before the first date, whitespace-collapsed. */
  nombre: string;
  /** Start date kept as printed, in NN/NN/NNNN form (presumably DD/MM/YYYY — confirm). */
  fechaInicio: string;
  /** End date in the same form; `undefined` when the row has only one date. */
  fechaFin?: string;
}

/** One row of the "Obligaciones" table, as parsed by `extractObligaciones`. */
export interface Obligacion {
  /** Obligation description: the text before the due-date wording. */
  descripcion: string;
  /**
   * Due-date wording. The parser only recognises it when it starts with one of
   * a fixed set of phrases ("A más tardar", "Dentro de", "Mensualmente", ...).
   */
  descripcionVencimiento: string;
  /** Start date kept as printed, in NN/NN/NNNN form (presumably DD/MM/YYYY — confirm). */
  fechaInicio: string;
  /** End date in the same form; `undefined` when the row has only one date. */
  fechaFin?: string;
}

/**
 * Structured result of `parseCsfPdf`.
 *
 * Non-optional string properties are guaranteed by the parser, which throws
 * when it cannot find them. Optional properties come straight from the label
 * map and are `undefined` when the label does not appear in the PDF text.
 */
export interface ConstanciaSituacionFiscal {
  /** Value of the "RFC" label; the parser throws when missing or empty. */
  rfc: string;
  /** Value of the "CURP" label. */
  curp?: string;
  /** Digits that follow "idCIF" in the PDF text. */
  idCIF: string;
  /** Value of the "Nombre (s)" label. */
  nombre?: string;
  /** Value of the "Primer Apellido" label. */
  primerApellido?: string;
  /** Value of the "Segundo Apellido" label. */
  segundoApellido?: string;
  /** Value of "Denominación o Razón Social", falling back to "Denominación/Razón Social". */
  razonSocial?: string;
  /** Value of the "Nombre Comercial" label; an empty value is reported as `undefined`. */
  nombreComercial?: string;
  /** Value of the "Fecha inicio de operaciones" label; required. */
  fechaInicioOperaciones: string;
  /** Value of the "Estatus en el padrón" label; required. */
  estatusPadron: string;
  /** Value of the "Fecha de último cambio de estado" label. */
  fechaUltimoCambioEstado?: string;
  /** Place-and-date line of issuance, whitespace-collapsed. */
  lugarFechaEmision: string;
  /** Address fields; each one is individually optional. */
  domicilio: Domicilio;
  /** Parsed rows of the economic-activities table (empty when the section is absent). */
  actividadesEconomicas: ActividadEconomica[];
  /** Parsed rows of the regimes table (empty when the section is absent). */
  regimenes: RegimenCsf[];
  /** Parsed rows of the obligations table (empty when the section is absent). */
  obligaciones: Obligacion[];
  /** "Cadena Original Sello" value (`||...||`) with all whitespace removed. */
  cadenaOriginalSello: string;
  /** "Sello Digital" value with all whitespace removed. */
  selloDigital: string;
}

/**
 * Extracts the plain text of a PDF using `pdf-parse`.
 *
 * The parser instance is destroyed in a `finally` clause, so `destroy()` is
 * awaited whether text extraction succeeds or rejects.
 *
 * @param pdfBuffer Raw bytes of the PDF document.
 * @returns The `text` property of the `getText()` result.
 */
async function extractPdfText(pdfBuffer: Buffer): Promise<string> {
  const pdf = new PDFParse({ data: pdfBuffer });
  try {
    const { text } = await pdf.getText();
    return text;
  } finally {
    await pdf.destroy();
  }
}

/**
 * Field labels that `extractLabels` recognises in the PDF text.
 *
 * Each entry plays two roles: it is a key whose value is captured, and it is a
 * delimiter that ends the value of the preceding label. An entry therefore
 * matters even when nothing reads its value — for instance 'Régimen Capital',
 * which `parseCsfPdf` never looks up.
 *
 * The strings must match the text before the colon exactly; they are
 * regex-escaped before being used in a pattern.
 */
const LABELS = [
  'RFC', 'CURP', 'Nombre (s)', 'Primer Apellido', 'Segundo Apellido',
  'Denominación o Razón Social', 'Denominación/Razón Social',
  'Régimen Capital', 'Fecha inicio de operaciones', 'Estatus en el padrón',
  'Fecha de último cambio de estado', 'Nombre Comercial',
  'Código Postal', 'Tipo de Vialidad', 'Nombre de Vialidad',
  'Número Exterior', 'Número Interior', 'Nombre de la Colonia',
  'Nombre de la Localidad', 'Nombre del Municipio o Demarcación Territorial',
  'Nombre de la Entidad Federativa', 'Entre Calle', 'Y Calle',
] as const;

/**
 * Escapes regular-expression metacharacters so a string can be embedded
 * literally in a `RegExp` source.
 *
 * @param s Text to escape.
 * @returns `s` with each of `. * + ? ^ $ { } ( ) | [ ] \` preceded by a backslash.
 */
function escapeRegex(s: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (special) => '\\' + special);
}

/**
 * Collects "Label: value" pairs for every entry of `LABELS` found in the text.
 *
 * A value runs from the colon up to (not including) the next known label, one
 * of the section headings, a "-- N of M" page marker, or the end of the text.
 * Whitespace runs inside a value are collapsed to a single space and the value
 * is trimmed, so a label without content yields an empty string.
 *
 * @param text Plain text extracted from the PDF.
 * @returns Map from label to value; when a label appears more than once the
 *          first occurrence wins.
 */
function extractLabels(text: string): Map<string, string> {
  const anyLabel = LABELS.map(escapeRegex).join('|');
  const pairPattern = new RegExp(
    `(${anyLabel})\\s*:\\s*([\\s\\S]*?)(?=\\s*(?:${anyLabel})\\s*:|\\n?\\s*(?:Datos del domicilio registrado|Actividades Económicas|Regímenes|Obligaciones|Cadena Original|Sus datos personales)\\b|\\n\\s*--\\s*\\d+\\s+of\\s+\\d+|$)`,
    'g',
  );

  const found = new Map<string, string>();
  for (const [, label, rawValue] of text.matchAll(pairPattern)) {
    // Only the first occurrence of a label is kept.
    if (found.has(label)) continue;

    let value = rawValue.replace(/\s+/g, ' ').trim();

    // Defensive cleanup: the SAT sometimes prints consecutive labels with no
    // value in between (e.g. "Número Interior:\nNombre de la Colonia: X"), and
    // the captured value may then begin with the following label. Strip one
    // such leading "<other label>:" prefix, compared case-insensitively.
    const lowered = value.toLowerCase();
    const leaked = LABELS.find(
      (other) => other !== label && lowered.startsWith(`${other}:`.toLowerCase()),
    );
    if (leaked !== undefined) {
      value = value.slice(leaked.length + 1).trim();
    }

    found.set(label, value);
  }
  return found;
}

/**
 * Reads the numeric idCIF from the PDF text.
 *
 * Matches "idCIF" case-insensitively, optionally followed by a colon, and
 * returns the run of digits that comes next.
 *
 * @param text Plain text extracted from the PDF.
 * @returns The digits following the first "idCIF" occurrence.
 * @throws Error when no "idCIF" followed by digits is present.
 */
function extractIdCIF(text: string): string {
  const digits = /idCIF\s*:?\s*(\d+)/i.exec(text)?.[1];
  if (digits === undefined) {
    throw new Error('idCIF no encontrado en PDF');
  }
  return digits;
}

/**
 * Extracts the "Lugar y Fecha de Emisión" line (place and date of issuance).
 *
 * Strategy:
 *  1. Take the text that follows the "Lugar y Fecha de Emisión" label, up to
 *     the end of that line or up to an RFC that the text extraction glued
 *     directly onto the date.
 *  2. If the label is not usable, look anywhere in the text for a
 *     "<PLACE>, <PLACE> A <day> DE <month> DE <year>" phrase.
 *
 * @param text Plain text extracted from the PDF.
 * @returns The place-and-date string with whitespace runs collapsed.
 * @throws Error when neither strategy finds a value.
 */
function extractLugarFechaEmision(text: string): string {
  // The RFC stop pattern accepts 3 or 4 leading characters (including Ñ and &)
  // followed by 6 digits, so a 3-letter RFC is cut off the same way as a
  // 4-letter one. The literal TORC/HTS alternatives are kept only for backward
  // compatibility with inputs the previous pattern already handled.
  const labelled = text.match(
    /Lugar y Fecha de Emisión\s*\n?\s*([^\n]+?)\s*(?=\n|TORC|HTS|[A-ZÑ&]{3,4}\d{6})/,
  );
  if (labelled) return labelled[1].replace(/\s+/g, ' ').trim();

  const freeForm = text.match(
    /([A-ZÁÉÍÓÚÑ ]+,\s*[A-ZÁÉÍÓÚÑ ]+\s+A\s+\d{1,2}\s+DE\s+[A-ZÁÉÍÓÚÑ]+\s+DE\s+\d{4})/i,
  );
  if (freeForm) return freeForm[1].replace(/\s+/g, ' ').trim();

  throw new Error('Lugar y Fecha de Emisión no encontrado');
}

/**
 * Matches a whole line that is only a page marker: either "-- N of M --" or
 * "Página [N] de [M]" (with literal square brackets), surrounded by optional
 * whitespace. `groupRowChunks` uses it to drop such lines from table bodies.
 */
const PAGE_NOISE_RE = /^\s*(?:--\s*\d+\s+of\s+\d+\s*--|Página\s*\[\d+\]\s*de\s*\[\d+\])\s*$/;

/**
 * Returns the text between a section header and the nearest following header.
 *
 * @param text Full text to search.
 * @param header Literal header that opens the section; its first occurrence is used.
 * @param nextHeaders Literal headers that may close the section. The one that
 *        appears earliest after `header` wins; if none appears, the section
 *        runs to the end of `text`.
 * @returns The section body (header excluded), or `''` when `header` is absent.
 */
function sliceSection(text: string, header: string, nextHeaders: string[]): string {
  const headerPos = text.indexOf(header);
  if (headerPos < 0) return '';

  const bodyStart = headerPos + header.length;
  const stops = nextHeaders
    .map((candidate) => text.indexOf(candidate, bodyStart))
    .filter((pos) => pos >= 0);
  const bodyEnd = Math.min(text.length, ...stops);

  return text.slice(bodyStart, bodyEnd);
}

/**
 * Re-assembles table rows that the PDF text extraction split over several lines.
 *
 * Lines are trimmed; blank lines and page-marker lines (`PAGE_NOISE_RE`) are
 * discarded. If the first remaining line matches `headerRowRegex` it is dropped
 * as the column-header row. The remaining lines are accumulated until one ends
 * with an NN/NN/NNNN date, at which point the accumulated lines are joined with
 * single spaces and emitted as one chunk.
 *
 * Lines left over after the last date-terminated line are not emitted.
 *
 * @param body Raw text of one table section.
 * @param headerRowRegex Pattern identifying the column-header row.
 * @returns One whitespace-normalised string per logical row.
 */
function groupRowChunks(body: string, headerRowRegex: RegExp): string[] {
  const rows: string[] = [];
  for (const rawLine of body.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length === 0 || PAGE_NOISE_RE.test(line)) continue;
    rows.push(line);
  }

  const hasHeaderRow = rows.length > 0 && headerRowRegex.test(rows[0]);
  const dataRows = hasHeaderRow ? rows.slice(1) : rows;

  const endsWithDate = /\d{2}\/\d{2}\/\d{4}\s*$/;
  const chunks: string[] = [];
  let chunkStart = 0;
  dataRows.forEach((row, index) => {
    if (!endsWithDate.test(row)) return;
    const joined = dataRows.slice(chunkStart, index + 1).join(' ');
    chunks.push(joined.replace(/\s+/g, ' ').trim());
    chunkStart = index + 1;
  });
  return chunks;
}

/**
 * Parses the "Actividades Económicas:" table.
 *
 * The section ends at "Regímenes:", "Obligaciones:" or "Cadena Original",
 * whichever comes first. Each reassembled row must have the shape
 * `<orden> <descripcion> <porcentaje> <fechaInicio> [<fechaFin>]`; rows that do
 * not match are skipped.
 *
 * @param text Plain text extracted from the PDF.
 * @returns Parsed rows, or an empty array when the section is missing or empty.
 */
function extractActividades(text: string): ActividadEconomica[] {
  const section = sliceSection(text, 'Actividades Económicas:', ['Regímenes:', 'Obligaciones:', 'Cadena Original']);
  if (section.length === 0) return [];

  const headerRow = /^\s*Orden\s+Actividad\s+Económica\s+Porcentaje\s+Fecha\s+Inicio\s+Fecha\s+Fin\s*$/i;
  const rowPattern = /^(\d+)\s+(.+?)\s+(\d+)\s+(\d{2}\/\d{2}\/\d{4})(?:\s+(\d{2}\/\d{2}\/\d{4}))?$/;

  return groupRowChunks(section, headerRow).flatMap((chunk): ActividadEconomica[] => {
    const parts = rowPattern.exec(chunk);
    if (parts === null) return [];
    const [, orden, descripcion, porcentaje, fechaInicio, fechaFin] = parts;
    return [
      {
        orden: Number(orden),
        descripcion: descripcion.replace(/\s+/g, ' ').trim(),
        porcentaje: Number(porcentaje),
        fechaInicio,
        // Stays undefined when the row has no end date.
        fechaFin,
      },
    ];
  });
}

/**
 * Parses the "Regímenes:" table.
 *
 * The section ends at "Obligaciones:" or "Cadena Original". Each reassembled
 * row must have the shape `<nombre> <fechaInicio> [<fechaFin>]`; rows that do
 * not match are skipped.
 *
 * @param text Plain text extracted from the PDF.
 * @returns Parsed rows, or an empty array when the section is missing or empty.
 */
function extractRegimenes(text: string): RegimenCsf[] {
  const section = sliceSection(text, 'Regímenes:', ['Obligaciones:', 'Cadena Original']);
  if (section.length === 0) return [];

  const rowPattern = /^(.+?)\s+(\d{2}\/\d{2}\/\d{4})(?:\s+(\d{2}\/\d{2}\/\d{4}))?$/;
  const regimenes: RegimenCsf[] = [];
  groupRowChunks(section, /^\s*Régimen\s+Fecha\s+Inicio\s+Fecha\s+Fin\s*$/i).forEach((chunk) => {
    const parts = rowPattern.exec(chunk);
    if (parts === null) return;
    const [, nombre, fechaInicio, fechaFin] = parts;
    regimenes.push({ nombre: nombre.replace(/\s+/g, ' ').trim(), fechaInicio, fechaFin });
  });
  return regimenes;
}

/**
 * Parses the "Obligaciones:" table.
 *
 * The section ends at "Sus datos personales" or "Cadena Original". A row is
 * split into description and due-date wording at the first occurrence of one
 * of the known due-date openers ("A más tardar", "Dentro de", "Mensualmente",
 * "Bimestralmente", "Trimestralmente", "Anualmente", "En los", "Cuando "),
 * followed by a start date and an optional end date. Rows that do not fit this
 * shape are skipped.
 *
 * @param text Plain text extracted from the PDF.
 * @returns Parsed rows, or an empty array when the section is missing or empty.
 */
function extractObligaciones(text: string): Obligacion[] {
  const section = sliceSection(text, 'Obligaciones:', ['Sus datos personales', 'Cadena Original']);
  if (section.length === 0) return [];

  const headerRow = /^\s*Descripción de la Obligación\s+Descripción Vencimiento\s+Fecha Inicio\s+Fecha Fin\s*$/i;
  const rowPattern = /^(.+?)\s+((?:A\s+m[aá]s\s+tardar|Dentro\s+de|Mensualmente|Bimestralmente|Trimestralmente|Anualmente|En\s+los|Cuando\s+)[\s\S]+?)\s+(\d{2}\/\d{2}\/\d{4})(?:\s+(\d{2}\/\d{2}\/\d{4}))?$/;

  return groupRowChunks(section, headerRow).reduce<Obligacion[]>((parsed, chunk) => {
    const parts = rowPattern.exec(chunk);
    if (parts !== null) {
      const [, descripcion, descripcionVencimiento, fechaInicio, fechaFin] = parts;
      parsed.push({
        descripcion: descripcion.trim(),
        descripcionVencimiento: descripcionVencimiento.trim(),
        fechaInicio,
        fechaFin,
      });
    }
    return parsed;
  }, []);
}

/**
 * Extracts the "Cadena Original Sello" value.
 *
 * The value must start with `||` and is taken up to the first `||` that is
 * followed (after optional whitespace) by "Sello Digital" or the end of the
 * text. All whitespace, including line breaks and inner spaces, is removed
 * from the result.
 *
 * @param text Plain text extracted from the PDF.
 * @returns The `||...||` string without any whitespace.
 * @throws Error when the pattern is not found.
 */
function extractCadenaOriginalSello(text: string): string {
  const found = /Cadena Original Sello\s*:\s*(\|\|[\s\S]+?\|\|)\s*(?:Sello Digital|$)/.exec(text);
  if (found === null) {
    throw new Error('Cadena Original Sello no encontrada');
  }
  return found[1].split(/\s+/).join('');
}

/**
 * Extracts the "Sello Digital" value.
 *
 * Accepts only base64 characters and whitespace after the label, and stops at
 * the first blank line, at the word "Página", or at the end of the text. All
 * whitespace is removed from the result.
 *
 * @param text Plain text extracted from the PDF.
 * @returns The seal as one unbroken string.
 * @throws Error when the pattern is not found.
 */
function extractSelloDigital(text: string): string {
  const found = /Sello Digital\s*:\s*([A-Za-z0-9+/=\s]+?)(?:\n\s*\n|Página|$)/.exec(text);
  if (found === null) {
    throw new Error('Sello Digital no encontrado');
  }
  return found[1].split(/\s+/).join('');
}

/**
 * Parses a SAT "Constancia de Situación Fiscal" PDF into a structured object.
 *
 * Steps: extract the PDF text, collect the labelled fields, then read the
 * idCIF, the place/date of issuance, the three tables and the two seal
 * strings from that same text.
 *
 * Optional fields are passed through from the label map as-is, so a label that
 * is present without a value yields an empty string. The exception is
 * `nombreComercial`, where an empty value is reported as `undefined`.
 *
 * @param pdfBuffer Raw bytes of the CSF PDF.
 * @returns The parsed constancia.
 * @throws Error when any of these cannot be found: idCIF, place/date of
 *         issuance, RFC, "Fecha inicio de operaciones", "Estatus en el
 *         padrón", "Cadena Original Sello" or "Sello Digital". Errors from the
 *         PDF text extraction propagate unchanged.
 */
export async function parseCsfPdf(pdfBuffer: Buffer): Promise<ConstanciaSituacionFiscal> {
  const text = await extractPdfText(pdfBuffer);
  const labels = extractLabels(text);
  const idCIF = extractIdCIF(text);
  const lugarFechaEmision = extractLugarFechaEmision(text);

  // Mandatory labels: a missing or empty value aborts the parse.
  const requireLabel = (label: string, errorMessage: string): string => {
    const value = labels.get(label);
    if (!value) throw new Error(errorMessage);
    return value;
  };
  const rfc = requireLabel('RFC', 'RFC no encontrado en PDF');
  const fechaInicioOperaciones = requireLabel(
    'Fecha inicio de operaciones',
    'Fecha inicio de operaciones no encontrada',
  );
  const estatusPadron = requireLabel('Estatus en el padrón', 'Estatus en el padrón no encontrado');

  const domicilio: Domicilio = {
    codigoPostal: labels.get('Código Postal'),
    tipoVialidad: labels.get('Tipo de Vialidad'),
    nombreVialidad: labels.get('Nombre de Vialidad'),
    numeroExterior: labels.get('Número Exterior'),
    numeroInterior: labels.get('Número Interior'),
    colonia: labels.get('Nombre de la Colonia'),
    localidad: labels.get('Nombre de la Localidad'),
    municipio: labels.get('Nombre del Municipio o Demarcación Territorial'),
    entidadFederativa: labels.get('Nombre de la Entidad Federativa'),
    entreCalle: labels.get('Entre Calle'),
    yCalle: labels.get('Y Calle'),
  };

  // Tables first, seals last: the seal extractors are the ones that can throw.
  const actividadesEconomicas = extractActividades(text);
  const regimenes = extractRegimenes(text);
  const obligaciones = extractObligaciones(text);
  const cadenaOriginalSello = extractCadenaOriginalSello(text);
  const selloDigital = extractSelloDigital(text);

  return {
    rfc,
    curp: labels.get('CURP'),
    idCIF,
    nombre: labels.get('Nombre (s)'),
    primerApellido: labels.get('Primer Apellido'),
    segundoApellido: labels.get('Segundo Apellido'),
    razonSocial: labels.get('Denominación o Razón Social') ?? labels.get('Denominación/Razón Social'),
    nombreComercial: labels.get('Nombre Comercial') || undefined,
    fechaInicioOperaciones,
    estatusPadron,
    fechaUltimoCambioEstado: labels.get('Fecha de último cambio de estado'),
    lugarFechaEmision,
    domicilio,
    actividadesEconomicas,
    regimenes,
    obligaciones,
    cadenaOriginalSello,
    selloDigital,
  };
}