2022-11-10 16:31:41 +00:00
|
|
|
|
import { readFileSync, writeFileSync } from 'fs'
|
2022-12-08 17:32:37 +00:00
|
|
|
|
import { join } from 'path'
|
2022-11-10 16:31:41 +00:00
|
|
|
|
import { PdfData, VerbosityLevel } from 'pdfdataextract'
|
2022-12-08 17:32:37 +00:00
|
|
|
|
import { fileURLToPath } from 'url'
|
2022-11-10 16:31:41 +00:00
|
|
|
|
|
2022-12-08 17:32:37 +00:00
|
|
|
|
import { customTags } from './custom-tags.js'
|
2022-11-10 16:31:41 +00:00
|
|
|
|
|
2022-12-08 17:32:37 +00:00
|
|
|
|
// ES modules do not provide `__dirname`; rebuild it from this module's URL.
const __dirname = fileURLToPath(new URL('.', import.meta.url))

/** Resolves `relativePath` against the directory that contains this script. */
const resolveFromScriptDir = (relativePath: string): string =>
  join(__dirname, relativePath)

/** PDF that is parsed; it is expected to sit next to this script. */
const INPUT_PDF_PATH = resolveFromScriptDir(
  './Nomenclatures_NAF_et_CPF_Reedition_2020.pdf'
)

/** File that receives the structured result as JSON. */
const OUTPUT_JSON_PATH = resolveFromScriptDir('./output.json')

/** File that receives the raw extracted text, page by page. */
const OUTPUT_ORIGINAL_TEXT_PATH = resolveFromScriptDir('./output.txt')

// The whole PDF is read synchronously when the module is loaded.
const fileData = readFileSync(INPUT_PDF_PATH)
|
2022-11-10 16:31:41 +00:00
|
|
|
|
|
|
|
|
|
// Script entry point: extract the text of every page, dump it verbatim for
// inspection, then write the structured nomenclature as JSON.
// `void` marks the promise as intentionally not awaited; failures are handled
// by the `.catch` below instead of surfacing as an unhandled rejection.
void PdfData.extract(fileData, {
  // how many pages should be read at most
  // pages: 6,
  // sort the text by text coordinates
  sort: true,
  // set the verbosity level for parsing
  verbosity: VerbosityLevel.ERRORS,
})
  .then(({ text }) => {
    // The raw dump is written first so it is available even when the
    // transformation below throws.
    writeFileSync(
      OUTPUT_ORIGINAL_TEXT_PATH,
      text?.join('\n\n[[[NEW PAGE]]]\n\n') ?? ''
    )
    writeFileSync(
      OUTPUT_JSON_PATH,
      JSON.stringify(transformText(text), null, 2)
    )
  })
  .catch((error: unknown) => {
    // eslint-disable-next-line no-console
    console.error('Failed to convert the nomenclature PDF:', error)
    // Report the failure to the caller without cutting pending output short.
    process.exitCode = 1
  })
|
|
|
|
|
|
2023-01-19 17:47:14 +00:00
|
|
|
|
/**
 * Hand-maintained list of extraction artefacts, compiled into a single
 * global regular expression (alternatives are tried in the order listed).
 *
 * It is meant to be used with a replacer that
 * - concatenates the captured groups when the matching alternative has any
 *   (this drops whatever sits between the groups: a line break, a hyphen…),
 * - otherwise removes the first run of whitespace from the whole match.
 *
 * Alternatives without groups and without whitespace (the "keep -" entries)
 * are therefore replaced by themselves; listing them first prevents the more
 * generic rules further down from matching at the same position.
 */
const specialCase = new RegExp(
  [
    // remove space
    'fi lm',
    'fi nal',
    'iff érent',
    'spécifi que',
    'infi rmeries',
    'artifi ciellement',

    // keep -
    'sports-|amateurs-',

    // remove \n and -
    '(divi)\\s*-\n(sions)',
    '(termi)\\s*-\n(naux)',
    '(enseigne)\\s*-\n(ment)',

    // remove space and \n
    '(est-)\\s*\n(à)',

    // remove \n
    '(de\\s+)\n(DVD)',
    '(en\\s+)\n(France)',
    // the dot is escaped so that only a literal "(cf." is matched
    '(DVD\\s+)\n(\\(cf\\.)',
    '(entre\\s+)\n(400)',
    '(niveau\\s*)\n(2)',
    // the dots are escaped so that only the literal code "22.2." is matched
    '(groupe\\s*)\n(22\\.2\\.)',
    '(métier\\s+)\n(Rachel)',
    '(ciments\\s+)\n(Portland)',
    '(divisions\\s+)\n(05, 07 et 08)',

    // remove \n
    '(\\w+\\s+)\n(\\d+\\s+%)',
    '([a-zA-Z]+-)\n([a-z]+)',

    // remove space after -
    "([0-9a-z\u00E0-\u00FCA-Z\u00C0-\u00DC]{3,}-)\\s+([0-9a-z\u00E0-\u00FCA-Z\u00C0-\u00DC'Œ]{3,})",

    // remove \n after an opening parenthesis that is closed on the next line
    "(\\([0-9a-z\u00E0-\u00FCA-Z\u00C0-\u00DC'., -]+)\n([0-9a-z\u00E0-\u00FCA-Z\u00C0-\u00DC'., -]+\\))",
  ].join('|'),
  'g'
)
|
|
|
|
|
|
2022-11-10 16:31:41 +00:00
|
|
|
|
/**
 * Turns the extracted PDF pages into a flat list of nomenclature elements
 * (section, division, groupe, classe, sousClasse, catégorie, sousCatégorie).
 *
 * Each page is first cleaned up (broken lines re-joined, known extraction
 * artefacts fixed through `specialCase`), then read line by line:
 * - a heading line opens a new element whose `parent` is the index, in the
 *   returned array, of the most recent element of the parent level;
 * - "Ce/Cette … comprend / comprend aussi / ne comprend pas" sentences and
 *   "CC :", "CA :", "NC :" markers select the content list that the following
 *   lines and bullet items are appended to;
 * - other lines are appended to the `data` of the current element.
 * Finally the entries of `customTags` for each element code are appended to
 * the matching content lists.
 *
 * The first page and every page containing "Table des matières" are skipped,
 * and free text found before the first section heading is ignored.
 *
 * @param pages - Text of each page, as returned by the PDF extraction.
 * @returns The parsed elements, in document order.
 * @throws Error when `pages` is missing, when a heading appears before any
 *   heading of its parent level, or when content appears before any heading.
 */
const transformText = (pages: PdfData['text']): Data[] => {
  if (!pages) {
    throw new Error('No text found')
  }

  type ContenuType = 'contenuCentral' | 'contenuAnnexe' | 'contenuExclu'

  // When switched on, every pushed entry is followed by the raw line it
  // comes from. `as boolean` keeps the compiler from narrowing it to `false`.
  const DEBUG_DATA = false as boolean
  const debugExtra = (raw: string): string[] =>
    DEBUG_DATA === true ? [raw] : []

  // Collapses whitespace runs into one space and removes spaces before ";".
  const normalize = (str: string) =>
    str.replace(/\s+/g, ' ').replace(/\s+;/g, ';')

  // Line classifiers, built once instead of once per line. None of them is
  // global, so matching does not depend on any state kept in the regex.
  const linePatterns = {
    // running header of a page
    pageHeader:
      /Nomenclature d’Activités Française NAF|(Classification|Classifi cation) des Produits Française CPF/,
    section: /^Section ([A-Z]) :\s+([^\n]+)$/,
    division: /^(\d{2})\s+([A-Z\u00C0-\u00DC][^\n]+)$/,
    groupe: /^(\d{2}\.\d)\s+([^\n]+)$/,
    classe: /^(\d{2}\.\d{2})\s+([^\n]+)$/,
    sousClasse: /^(\d{2}\.\d{2}[A-Z])\s+([^\n]+)$/,
    catégorie: /^(\d{2}\.\d{2}\.\d)\s+([^\n]+)$/,
    sousCatégorie: /^(\d{2}\.\d{2}\.\d{2})\s+([^\n]+)$/,
    // `[•\s-]` is the same set as the former `[•-\s]`, without relying on
    // the legacy handling of a class escape used as a range bound.
    contenuCentral: /^(CC)\s+:[•\s-]*([^\n]+)$/,
    contenuAnnexe: /^(CA)\s+:[•\s-]*([^\n]+)$/,
    contenuExclu: /^(NC)\s+:[•\s-]*([^\n]+)$/,
    comprend: /^(?:Ce|Cette)\s+[^\s]+\s+(?:comprend|couvre)\s+(?::|([^\n]+))$/,
    comprendAussi:
      /^(?:Ce|Cette)\s+[^\s]+\s+(?:comprend|couvre)\s+aussi\s+(?::|([^\n]+))$/,
    comprendPas:
      /^(?:Ce|Cette)\s+[^\s]+\s+ne\s+(?:comprend|couvre)\s+pas\s+(?::|([^\n]+))$/,
    produitsAssociés: /^(?:Produits associés :\s+(.*)|[0-9p,. ]+)$/,
    parExemple: /^Par exemple :$/,
    item: /^(•|-|\d+\))\s+(.+)$/,
  }

  // Parsing state shared by all pages: index (in the result array) of the
  // last element seen for each level, and the content list currently fed.
  const previous: {
    index: Record<Data['type'], number | null>
    contenuType: ContenuType | null
  } = {
    index: {
      section: null,
      division: null,
      groupe: null,
      classe: null,
      sousClasse: null,
      catégorie: null,
      sousCatégorie: null,
    },
    contenuType: null,
  }

  // Returns the `type` content list of `element`, or throws a descriptive
  // error when there is no element yet (content found before any heading).
  const contenuListOf = (
    element: Data | undefined,
    type: ContenuType,
    pageIndex: number
  ): string[] => {
    if (!element) {
      throw new Error(
        `In page n°${pageIndex + 1}, ${type} found before any heading`
      )
    }
    if (!(type in element)) {
      throw new Error(
        `In page n°${pageIndex + 1}, no ${type} in ${element.type}`
      )
    }
    return element[type]
  }

  // Appends a new element of level `type` to `arr` when `match` is set.
  const addNomenclature = (
    arr: Data[],
    line: string,
    type: Data['type'],
    parentType: Data['type'] | null,
    match: RegExpMatchArray | null
  ): void => {
    if (!match) {
      return
    }

    if (parentType && previous.index[parentType] === null) {
      // eslint-disable-next-line no-console
      console.error('\n\n\n' + line)

      throw new Error(`no ${parentType} found`)
    }
    // The new element is about to be pushed at this position.
    previous.index[type] = arr.length

    // eslint-disable-next-line no-console
    console.log(`[${type}]:`, match[1], '-', normalize(match[2]))

    arr.push({
      type,
      code: match[1],
      title: normalize(match[2]),
      data: [],
      contenuCentral: [],
      contenuAnnexe: [],
      contenuExclu: [],
      parent: (parentType && previous.index[parentType]) ?? undefined,
    })
  }

  const elements = pages.reduce((arr, page, pageIndex) => {
    // Skip the first page and the table(s) of contents.
    if (pageIndex === 0 || page.match(/Table des matières/)) {
      return arr
    }

    const lines = page
      // Drops whitespace followed by a run of four or more "1"/"2"
      // characters (presumably a layout artefact — TODO confirm).
      .replace(/\s+[21]{4,}/g, '')
      // Re-joins, with a single space, an upper-case run continued on the
      // next line, and a line ending with a space whose continuation does
      // not start with an upper-case letter, a digit, a bullet or a hyphen.
      .replace(
        /([A-Z\u00C0-\u00DC()';]{2,}[\s;,-]+)\n([A-Z\u00C0-\u00DC()';]{2,})|([^A-Z])\s\n([^A-Z0-9\s•-])/g,
        '$1$3 $2$4'
      )
      // One-off fix: puts the "- la …" item back on its own line.
      .replace(
        /-\s+séchées,\s+cuites,\s+etc\.\)la/g,
        ' séchées, cuites, etc.)\n- la'
      )
      // Known artefacts: keep the captured groups only, or, for the
      // alternatives without groups, remove the first whitespace run.
      .replace(
        specialCase,
        (removeSpace, ...rest) =>
          // the last two arguments are the match offset and the whole input
          rest.slice(0, -2).join('') || removeSpace.replace(/\s+/, '')
      )
      .split('\n')

    for (const rawLine of lines) {
      // eslint-disable-next-line no-console
      console.log('###', `>${rawLine}<`)

      const line = rawLine.trim()
      if (line === '' || line.match(linePatterns.pageHeader)) {
        // eslint-disable-next-line no-console
        console.log('[pass]')

        continue
      }

      const section = line.match(linePatterns.section)
      const division = line.match(linePatterns.division)
      const groupe = line.match(linePatterns.groupe)
      const classe = line.match(linePatterns.classe)
      const sousClasse = line.match(linePatterns.sousClasse)

      const catégorie = line.match(linePatterns.catégorie)
      const sousCatégorie = line.match(linePatterns.sousCatégorie)

      const contenuCentral = line.match(linePatterns.contenuCentral)
      const contenuAnnexe = line.match(linePatterns.contenuAnnexe)
      const contenuExclu = line.match(linePatterns.contenuExclu)

      const comprend = line.match(linePatterns.comprend)
      const comprendAussi = line.match(linePatterns.comprendAussi)
      const comprendPas = line.match(linePatterns.comprendPas)
      const produitsAssociés = line.match(linePatterns.produitsAssociés)

      const parExemple = line.match(linePatterns.parExemple)
      const item = line.match(linePatterns.item)

      // - CC: central content. Represents a significant part of the activity
      //   of one or the other item.
      // - CA: ancillary content. Represents an accessory part of the activity
      //   of both items.
      // - NC: excluded content. Useful when both items are identical except
      //   for one easily identified component.
      // The 'NC' of a link generally correspond to 'CA' of links involving at
      // least one of the two items concerned.

      // Element that content lines of this iteration are attached to;
      // undefined until the first heading has been parsed.
      const previousElement: Data | undefined = arr[arr.length - 1]

      // True for headings and for the markers that select a content list.
      const isMarker = Boolean(
        section ||
          division ||
          groupe ||
          classe ||
          catégorie ||
          sousCatégorie ||
          sousClasse ||
          contenuCentral ||
          contenuAnnexe ||
          contenuExclu ||
          comprend ||
          comprendAussi ||
          comprendPas
      )

      // Free text (and items met outside of any content list) goes to `data`.
      if (!isMarker && !parExemple && (!item || !previous.contenuType)) {
        // Nothing to attach to before the first section: ignore the line.
        if (previous.index.section === null || previousElement === undefined) {
          continue
        }

        previousElement.data.push(normalize(line), ...debugExtra(line))
      }

      if (!isMarker && !produitsAssociés && !item && previous.contenuType) {
        // Plain line while a content list is active: it belongs to that list.
        contenuListOf(previousElement, previous.contenuType, pageIndex).push(
          normalize(line),
          ...debugExtra(line)
        )
      } else if (!item && !produitsAssociés) {
        // A heading or marker closes the active content list.
        previous.contenuType = null
      }

      if (item && previous.contenuType) {
        // Only the "•" bullet is kept in the stored text.
        contenuListOf(previousElement, previous.contenuType, pageIndex).push(
          (item[1] === '•' ? '• ' : '') + normalize(item[2]),
          ...debugExtra(line)
        )
      }

      // "comprend aussi …" also matches the plain `comprend` pattern, hence
      // the extra conditions on the first branch.
      const inclusion: { type: ContenuType; match: RegExpMatchArray } | null =
        comprend && !comprendAussi && !comprendPas
          ? { type: 'contenuCentral', match: comprend }
          : comprendAussi
          ? { type: 'contenuAnnexe', match: comprendAussi }
          : comprendPas
          ? { type: 'contenuExclu', match: comprendPas }
          : null

      if (inclusion) {
        // Text following the verb on the same line, if any.
        const inlineText = inclusion.match[1]
        if (inlineText) {
          contenuListOf(previousElement, inclusion.type, pageIndex).push(
            normalize(inlineText),
            ...debugExtra(inlineText)
          )
        }
        previous.contenuType = inclusion.type
        continue
      }

      // "CC :", "CA :" and "NC :" lines carry their first entry inline.
      const contenuMarkers: [ContenuType, RegExpMatchArray | null][] = [
        ['contenuCentral', contenuCentral],
        ['contenuAnnexe', contenuAnnexe],
        ['contenuExclu', contenuExclu],
      ]
      for (const [type, match] of contenuMarkers) {
        if (!match) {
          continue
        }
        contenuListOf(previousElement, type, pageIndex).push(
          normalize(match[2]),
          ...debugExtra(line)
        )
        previous.contenuType = type
      }

      addNomenclature(arr, line, 'section', null, section)
      addNomenclature(arr, line, 'division', 'section', division)
      addNomenclature(arr, line, 'groupe', 'division', groupe)
      addNomenclature(arr, line, 'classe', 'groupe', classe)
      addNomenclature(arr, line, 'sousClasse', 'classe', sousClasse)
      addNomenclature(arr, line, 'catégorie', 'sousClasse', catégorie)
      addNomenclature(arr, line, 'sousCatégorie', 'catégorie', sousCatégorie)
    }

    return arr
  }, [] as Data[])

  // Append the manually maintained entries, keyed by element code.
  return elements.map((element) => {
    const {
      contenuCentral = [],
      contenuAnnexe = [],
      contenuExclu = [],
    } = customTags[element.code] ?? {}

    element.contenuCentral.push(...contenuCentral)
    element.contenuAnnexe.push(...contenuAnnexe)
    element.contenuExclu.push(...contenuExclu)

    return element
  })
}
|
|
|
|
|
|
|
|
|
|
/**
 * Shape shared by every nomenclature element; `Kind` is the literal that
 * identifies the level of the element.
 */
interface CommonData<Kind extends string> {
  /** Level of the element. */
  type: Kind
  /** Code of the element, as matched at the start of its heading line. */
  code: string
  /** Heading text, with whitespace normalized. */
  title: string
  /** Lines attached to the element that are not headings or markers. */
  data: string[]
  /** Entries of the central content ("CC" / "comprend"). */
  contenuCentral: string[]
  /** Entries of the ancillary content ("CA" / "comprend aussi"). */
  contenuAnnexe: string[]
  /** Entries of the excluded content ("NC" / "ne comprend pas"). */
  contenuExclu: string[]
  /** Index of the parent element in the result array; unset for sections. */
  parent?: number
}

// One alias per level of the nomenclature, from the broadest to the finest.
type Section = CommonData<'section'>
type Division = CommonData<'division'>
type Groupe = CommonData<'groupe'>
type Classe = CommonData<'classe'>
type SousClasse = CommonData<'sousClasse'>
type Catégorie = CommonData<'catégorie'>
type SousCatégorie = CommonData<'sousCatégorie'>

/** Any element of the parsed nomenclature, discriminated by `type`. */
export type Data =
  | Section
  | Division
  | Groupe
  | Classe
  | SousClasse
  | Catégorie
  | SousCatégorie
|