SUB HtmlToTxt(f$,ft$)
' Rough conversion of the HTML file f$ into the plain-text file ft$.
'   f$  : path of the HTML file to read
'   ft$ : path of the text file to write (one FILE_WRITELN of the result)
' Steps:
'   1. keep only what follows "</head>" (if that tag is missing, a message is
'      shown and the SUB exits without writing ft$)
'   2. remove <script...>...</script> and <style...>...</style> with their content
'   3. turn "</tr>" and "<br...>" into line breaks, drop every other tag
'   4. decode a few character references and the 2-byte UTF-8 sequences of
'      accented letters into single characters
'   5. collapse consecutive line breaks into one
' Tags are searched exactly as spelled below (lower case).
' Channel 9 is used for all file accesses.
DIM_LOCAL a$,s$,r$,cr$,ut$,k%,k1%,c%,ns%,csp$(30),crm$(30)
cr$ = CHR$(13)+CHR$(10)
' first byte of the 2-byte UTF-8 sequences for U+00C0..U+00FF
ut$ = CHR$(195)

' Search / replacement pairs, applied in this order (add new pairs before "&amp;").
' Everything is spelled with ASCII text and CHR$ codes so the table does not
' depend on the encoding in which this source file is saved.
ns% = 0
ns% = ns%+1: csp$(ns%) = "&nbsp;": crm$(ns%) = " "
ns% = ns%+1: csp$(ns%) = "&lt;": crm$(ns%) = "<"
ns% = ns%+1: csp$(ns%) = "&gt;": crm$(ns%) = ">"
ns% = ns%+1: csp$(ns%) = "&quot;": crm$(ns%) = CHR$(34)
' closing guillemet, preceded by a line break
ns% = ns%+1: csp$(ns%) = "&raquo;": crm$(ns%) = cr$+CHR$(187)
' UTF-8 "oe" / "OE" ligatures (bytes 197,147 and 197,146)
' NOTE(review): 156 / 140 are the Windows-1252 codes of these letters - confirm code page
ns% = ns%+1: csp$(ns%) = CHR$(197)+CHR$(147): crm$(ns%) = CHR$(156)
ns% = ns%+1: csp$(ns%) = CHR$(197)+CHR$(146): crm$(ns%) = CHR$(140)
' "&amp;" must stay last, otherwise "&amp;lt;" would be unescaped twice
ns% = ns%+1: csp$(ns%) = "&amp;": crm$(ns%) = "&"

' read the whole HTML file into a$ (size taken from a binary open, content from a text open)
FILEBIN_OPEN_READ 9,f$: k% = FILEBIN_SIZE(9): FILEBIN_CLOSE 9
FILE_OPEN_READ 9,f$: FILE_READBUF 9,a$,k%: FILE_CLOSE 9

' keep only what follows </head>
k% = INSTR(a$,"</head>")
IF k% = 0
  message "Pas de balise </head>"
  EXIT_SUB
END_IF
a$ = MID$(a$,k%+7,LEN(a$))

' remove <script...>...</script> (pass 1) then <style...>...</style> (pass 2)
FOR c% = 1 TO 2
  IF c% = 1
    s$ = "<script": r$ = "</script>"
  ELSE
    s$ = "<style": r$ = "</style>"
  END_IF
  k% = INSTR(a$,s$)
  WHILE k% > 0
    ' the closing tag is searched from the opening tag onwards
    k1% = INSTR_POS(a$,r$,k%)
    IF k1% = 0
      ' no closing tag: drop everything from the opening tag so the loop ends
      a$ = LEFT$(a$,k%-1)
    ELSE
      a$ = LEFT$(a$,k%-1)+MID$(a$,k1%+LEN(r$),LEN(a$))
    END_IF
    k% = INSTR(a$,s$)
  END_WHILE
NEXT c%

' each </tr> becomes a line break
k% = INSTR(a$,"</tr>")
WHILE k% > 0
  a$ = LEFT$(a$,k%-1)+cr$+MID$(a$,k%+5,LEN(a$))
  k% = INSTR(a$,"</tr>")
END_WHILE

' strip every remaining tag; tags starting with "<br" become a line break
k% = INSTR(a$,"<")
WHILE k% > 0
  k1% = INSTR_POS(a$,">",k%)
  IF k1% = 0
    ' "<" without a matching ">": drop the unterminated tail so the loop ends
    a$ = LEFT$(a$,k%-1)
  ELSE
    IF MID$(a$,k%,3) = "<br"
      a$ = LEFT$(a$,k%-1)+cr$+MID$(a$,k1%+1,LEN(a$))
    ELSE
      a$ = LEFT$(a$,k%-1)+MID$(a$,k1%+1,LEN(a$))
    END_IF
  END_IF
  k% = INSTR(a$,"<")
END_WHILE

' apply the search / replacement pairs
FOR k% = 1 TO ns%
  s$ = csp$(k%): r$ = crm$(k%)
  k1% = INSTR(a$,s$)
  WHILE k1% > 0
    a$ = LEFT$(a$,k1%-1)+r$+MID$(a$,k1%+LEN(s$),LEN(a$))
    ' resume after the inserted text: already-decoded text is never rescanned
    k1% = k1%+LEN(r$)
    IF k1% > LEN(a$)
      k1% = 0
    ELSE
      k1% = INSTR_POS(a$,s$,k1%)
    END_IF
  WHILE_END_MARKER_REMOVED = 0
  END_WHILE
NEXT k%

' decode the UTF-8 pairs 195,128..191 (U+00C0..U+00FF, the accented letters):
' the single-byte code is the second byte + 64 (e.g. 195,169 -> 233, e acute)
' NOTE(review): assumes ASC returns the 0..255 code of the character - confirm
k% = INSTR(a$,ut$)
WHILE k% > 0
  IF k% < LEN(a$)
    c% = ASC(MID$(a$,k%+1,1))
    IF c% > 127
      IF c% < 192
        a$ = LEFT$(a$,k%-1)+CHR$(c%+64)+MID$(a$,k%+2,LEN(a$))
      END_IF
    END_IF
  END_IF
  ' continue after position k% (it now holds a decoded character or a lone 195)
  IF k% < LEN(a$)
    k% = INSTR_POS(a$,ut$,k%+1)
  ELSE
    k% = 0
  END_IF
END_WHILE

' collapse runs of line breaks: remove one CR/LF while two are adjacent
s$ = cr$+cr$
k% = INSTR(a$,s$)
WHILE k% > 0
  a$ = LEFT$(a$,k%-1)+MID$(a$,k%+2,LEN(a$))
  k% = INSTR_POS(a$,s$,k%)
END_WHILE

FILE_OPEN_WRITE 9,ft$: FILE_WRITELN 9,a$: FILE_CLOSE 9
END_SUB