Legilibre · Changaco · Jan 12, 2019 · Dec 25, 2018 · Jan 12, 2019 · Jan 12, 2019
diff --git a/legi/french.py b/legi/french.py
@@ -8,6 +8,8 @@
 from .utils import add_accentless_fallbacks, strip_down
 
 
+INTRA_WORD_CHARS = "'’.-‐‑–"  # the dot is for abbreviations
+
 ORDINALS = (
     "première|premier|deuxième|seconde|second|troisième|quatrième|cinquième|"
     "sixième|septième|huitième|neuvième|dixième|"

diff --git a/legi/normalize.py b/legi/normalize.py
@@ -20,7 +20,6 @@
     legifrance_url_section, normalize_section_num, section_re, section_type_p,
     sujet_re,
 )
-from .spelling import spellcheck
 from .titles import NATURE_MAP_R_SD, gen_titre, normalize_title, parse_titre
 from .utils import (
     ascii_spaces_re, connect_db, filter_nonalnum, mimic_case, nonword_re,
@@ -353,21 +352,15 @@ def update_article(data):
             is_full_match = len(m.group(0)) == len(num)
             if not is_full_match:
                 offset = m.end(0)
-                part1, part2 = num[:offset], num[offset:]
+                part2 = num[offset:]
                 is_full_match = (
                     part2[:3] == ' : ' or
                     part2.startswith(' relative ') or
                     part2.startswith(' relatif ')
                 )
                 if is_full_match:
-                    if upper_word_re.search(part2):
-                        if spellcheck(part2):
-                            num = part1 + upper_word_re.sub(lower, part2)
-                            count('lowercased subtitle (spellcheck)')
-                        else:
-                            count('still uppercase')
-                            url = legifrance_url_article(article_id, cid)
-                            print("Warning: still uppercase:", repr(num), ' ', url)
+                    if upper_words_percentage(part2) > 0.2:
+                        count('detected a bad title (uppercase)')
                 elif part2.startswith(' aux articles '):
                     # titre tronqué, on essaye de le compléter en extrayant le
                     # premier paragraphe du contenu de l'article
@@ -391,24 +384,11 @@ def update_article(data):
                         print("Warning: échec de la récupération du titre: %r   %s" % (paragraph, url))
             if is_full_match:
                 count('article_titre regexp matched')
-                if num != orig_num:
-                    add_change((orig_num, num))
-                continue
             else:
                 count('article_titre regexp did not match')
                 url = legifrance_url_article(article_id, cid)
                 print("Warning: capture partielle du numéro: %r   %s" % (show_match(m), url))
 
-        if upper_word_re.search(num):
-            if spellcheck(num):
-                num = upper_word_re.sub(lower, num)
-                num = num[0].upper() + num[1:]
-                count('lowercased (spellcheck)')
-            else:
-                count('still uppercase')
-                url = legifrance_url_article(article_id, cid)
-                print("Warning: still uppercase:", repr(num), ' ', url)
-
         if num != orig_num:
             add_change((orig_num, num))
 
@@ -634,16 +614,20 @@ def normalize_text_titles(db, dry_run=False, log_file=None):
         'date_texte': 32,
     }
 
-    update_counts = defaultdict(int)
-    def count_update(k):
-        update_counts[k] += 1
+    counts = defaultdict(int)
 
     changes = defaultdict(int)
+    def add_change(orig_value, new_value):
+        if filter_nonalnum(new_value) == filter_nonalnum(orig_value):
+            # Not worth logging
+            return
+        changes[(orig_value, new_value)] += 1
+
     updates = {}
     orig_values = {}
     q = db.all("""
         SELECT id, titre, titrefull, titrefull_s, nature, num, date_texte, autorite
-          FROM textes_versions
+          FROM textes_versions_brutes_view
     """)
     for row in q:
         text_id, titre_o, titrefull_o, titrefull_s_o, nature_o, num, date_texte, autorite = row
@@ -677,6 +661,7 @@ def count_update(k):
                     elif n_upper_2 > n_upper_1:
                         titre = titrefull[:len_titre]
         if upper_words_percentage(titre) > 0.2:
+            counts['failed to normalize titre (still uppercase)'] += 1
             print('Échec: titre "', titre, '" contient beaucoup de mots en majuscule', sep='')
         if nature != 'CODE':
             anomaly = [False]
@@ -711,8 +696,11 @@ def get_key(key, ignore_not_found=False):
                           sep='')
                     anomaly[0] = True
                 annexe = get_key('annexe', ignore_not_found=True)
-                nature_d = strip_down(get_key('nature'))
+                nature_complète = get_key('nature')
+                nature_d = strip_down(nature_complète)
                 nature_d = NATURE_MAP_R_SD.get(nature_d, nature_d).upper()
+                if ' ' in nature_d:
+                    nature_d = nature_d.split(' ', 1)[0]
                 if nature_d and nature_d != nature:
                     if not nature:
                         nature = nature_d
@@ -725,15 +713,19 @@ def get_key(key, ignore_not_found=False):
                 num_d = get_key('numero', ignore_not_found=True)
                 if num_d and num_d != num and num_d != date_texte:
                     if not num or not num[0].isdigit():
-                        if not annexe:  # On ne veut pas donner le numéro d'un décret à son annexe
+                        if annexe:
+                            # On ne veut pas donner le numéro d'un décret à son annexe,
+                            # mais on ne va pas retirer le numéro du titre non plus
+                            num = num_d
+                        else:
                             if '-' in num_d or nature == 'DECISION':
                                 orig_values['num'] = num
                                 updates['num'] = num = num_d
-                                count_update('num')
+                                counts['updated num'] += 1
                     elif num[-1] == '.' and num[:-1] == num_d:
                         orig_values['num'] = num
                         updates['num'] = num = num_d
-                        count_update('num')
+                        counts['updated num'] += 1
                     else:
                         print('Incohérence: numéro: "', num_d, '" (detecté) ≠ "', num, '" (donné)', sep='')
                         anomaly[0] = True
@@ -743,7 +735,7 @@ def get_key(key, ignore_not_found=False):
                     if not date_texte or date_texte == '2999-01-01':
                         orig_values['date_texte'] = date_texte
                         updates['date_texte'] = date_texte = date_texte_d
-                        count_update('date_texte')
+                        counts['updated date_texte'] += 1
                     elif date_texte_d != date_texte:
                         print('Incohérence: date: "', date_texte_d, '" (detectée) ≠ "', date_texte, '" (donnée)', sep='')
                         anomaly[0] = True
@@ -755,34 +747,46 @@ def get_key(key, ignore_not_found=False):
                         if not autorite:
                             orig_values['autorite'] = autorite
                             updates['autorite'] = autorite = autorite_d
-                            count_update('autorite')
+                            counts['updated autorite'] += 1
                         elif autorite != autorite_d:
                             print('Incohérence: autorité "', autorite_d, '" (detectée) ≠ "', autorite, '" (donnée)', sep='')
                             anomaly[0] = True
                 if not anomaly[0]:
-                    titre = gen_titre(annexe, nature, num, date_texte, calendar, autorite)
+                    titre = gen_titre(annexe, nature_complète, num, date_texte, calendar, autorite)
                     len_titre = len(titre)
                     titrefull_p2 = titrefull[endpos2:]
-                    if titrefull_p2 and titrefull_p2[0] != ' ':
+                    if titrefull_p2 and titrefull_p2[0].isalnum():
                         titrefull_p2 = ' ' + titrefull_p2
                     titrefull = titre + titrefull_p2
+                    if num and titrefull.count(num) != 1:
+                        print((
+                            "Échec: `num` apparaît %i fois dans le `titrefull`: %r\n"
+                            "             construit à partir de `titrefull_o`: %r\n"
+                            "                                 et de `titre_o`: %r"
+                        ) % (titrefull.count(num), titrefull, titrefull_o, titre_o))
+        if titrefull != titre and upper_words_percentage(titrefull) > 0.5:
+            counts['detected a bad titrefull (uppercase)'] += 1
+        if quotes_re.search(titrefull):
+            titrefull = quotes_re.sub(replace_quotes, titrefull)
+            counts['normalized quotes in titrefull'] += 1
         titrefull_s = filter_nonalnum(titrefull)
         if titre != titre_o:
-            count_update('titre')
+            counts['updated titre'] += 1
             orig_values['titre'] = titre_o
             updates['titre'] = titre
+            add_change(titre_o, titre)
         if titrefull != titrefull_o:
-            count_update('titrefull')
+            counts['updated titrefull'] += 1
             orig_values['titrefull'] = titrefull_o
             updates['titrefull'] = titrefull
+            add_change(titrefull_o, titrefull)
         if nature != nature_o:
-            count_update('nature')
+            counts['updated nature'] += 1
             orig_values['nature'] = nature_o
             updates['nature'] = nature
         for col, new_value in updates.items():
             orig_value = orig_values[col]
             assert new_value != orig_value
-            changes[(orig_value, new_value)] += 1
         if titrefull_s != titrefull_s_o:
             updates['titrefull_s'] = titrefull_s
         if updates:
@@ -802,8 +806,7 @@ def get_key(key, ignore_not_found=False):
                     db.insert("textes_versions_brutes", orig_values, replace=True)
                 orig_values.clear()
 
-    print('Done. Updated %i values: %s' %
-          (sum(update_counts.values()), json.dumps(update_counts, indent=4)))
+    print('Done. Result:', json.dumps(counts, indent=4))
 
     if log_file:
         log_file.write("# titres de textes\n")
@@ -817,7 +820,7 @@ def get_key(key, ignore_not_found=False):
 if __name__ == '__main__':
     p = ArgumentParser()
     p.add_argument('db')
-    p.add_argument('what', default='all', choices=[
+    p.add_argument('what', nargs='?', default='all', choices=[
         'all', 'articles_num', 'sections_titres', 'textes_titres'
     ])
     p.add_argument('--dry-run', action='store_true', default=False)

diff --git a/legi/sql/migrations.sql b/legi/sql/migrations.sql
@@ -31,3 +31,18 @@ CREATE VIEW textes_versions_brutes_view AS
 
 -- migration #3
 !RECREATE!
+
+-- migration #4
+DROP VIEW textes_versions_brutes_view;
+CREATE VIEW textes_versions_brutes_view AS
+    SELECT a.dossier, a.cid, a.id,
+           (CASE WHEN b.bits & 1 > 0 THEN b.nature ELSE a.nature END) AS nature,
+           (CASE WHEN b.bits & 2 > 0 THEN b.titre ELSE a.titre END) AS titre,
+           (CASE WHEN b.bits & 4 > 0 THEN b.titrefull ELSE a.titrefull END) AS titrefull,
+           (CASE WHEN b.bits & 8 > 0 THEN b.autorite ELSE a.autorite END) AS autorite,
+           (CASE WHEN b.bits & 16 > 0 THEN b.num ELSE a.num END) AS num,
+           (CASE WHEN b.bits & 32 > 0 THEN b.date_texte ELSE a.date_texte END) AS date_texte,
+           a.titrefull_s
+      FROM textes_versions a
+ LEFT JOIN textes_versions_brutes b
+        ON b.id = a.id AND b.cid = a.cid AND b.dossier = a.dossier AND b.mtime = a.mtime;
diff --git a/legi/sql/schema.sql b/legi/sql/schema.sql
@@ -144,7 +144,8 @@ CREATE VIEW textes_versions_brutes_view AS
            (CASE WHEN b.bits & 4 > 0 THEN b.titrefull ELSE a.titrefull END) AS titrefull,
            (CASE WHEN b.bits & 8 > 0 THEN b.autorite ELSE a.autorite END) AS autorite,
            (CASE WHEN b.bits & 16 > 0 THEN b.num ELSE a.num END) AS num,
-           (CASE WHEN b.bits & 32 > 0 THEN b.date_texte ELSE a.date_texte END) AS date_texte
+           (CASE WHEN b.bits & 32 > 0 THEN b.date_texte ELSE a.date_texte END) AS date_texte,
+           a.titrefull_s
       FROM textes_versions a
  LEFT JOIN textes_versions_brutes b
         ON b.id = a.id AND b.cid = a.cid AND b.dossier = a.dossier AND b.mtime = a.mtime;
diff --git a/legi/titles.py b/legi/titles.py
@@ -7,6 +7,7 @@
 from .fr_calendar import (
     MOIS_GREG, MOIS_REPU, convert_date_to_iso, gregorian_to_republican,
 )
+from .french import INTRA_WORD_CHARS as intra_word
 from .roman import decimal_to_roman
 from .utils import spaces_re, strip_down
 
@@ -32,22 +33,22 @@
 jour_p = r'(?P<jour>1er|[0-9]{1,2})'
 mois_p = r'(?P<mois>%s)' % '|'.join(MOIS_GREG+MOIS_REPU)
 annee_p = r'(?P<annee>[0-9]{4,}|an [IVX]+)'
-numero_re = re.compile(r'n°(?!\s)', re.U)
+numero_re = re.compile(r'n°( ?° ?|(?!\s))', re.U)
 premier_du_mois = re.compile(r'\b1 %(mois_p)s %(annee_p)s' % globals())
 
 ordure_p = r'quinquennale?'
 annexe_p = r"(?P<annexe>Annexe (au |à la |à l'|du ))"
 autorite_p = r'(?P<autorite>ministériel(le)?|du Roi|du Conseil d\'[EÉ]tat)'
-date_p = r'(du )?(?P<date>(%(jour_p)s )?%(mois_p)s( %(annee_p)s)?)( (?P=annee))?' % globals()
-type_loi_p = r'(constitutionnelle|organique|locale)'
+date_p = r'(du )?(?P<date>(%(jour_p)s )?%(mois_p)s( %(annee_p)s)?)( (?P=annee)(?!-))?' % globals()
+type_loi_p = r'(constitutionnelle|organique|locale|de(?: [\w%s]+){1,20}(?= \(?n°))' % intra_word
 nature_p = r'(?P<nature>Arr[êe]t[ée]|Code|Constitution|Convention|Décision|Déclaration|Décret(-loi)?|Loi( %(type_loi_p)s)?|Ordonnance)' % globals()
 nature_strict_p = r'(?P<nature>Arrêté|Code|Constitution|Convention|Décision|Déclaration|Décret(-loi)?|Loi( %(type_loi_p)s)?|Ordonnance)' % globals()
 nature2_re = re.compile(r'(?P<nature2> (constitutionnelle|organique|locale))', re.U | re.I)
-numero_p = r'(n° ?)?(?P<numero>[0-9]+([\-–][0-9]+)*(, ?[0-9]+(-[0-9]+)*)*( et autres)?)\.?'
+numero_p = r'((du )?n[°o.] ?)?(?P<numero>[0-9]+([\-–][0-9]+)*(, ?[0-9]+(-[0-9]+)*)*( et autres)?)\.?'
 titre1_re = re.compile(r'(%(annexe_p)s)?(%(nature_p)s)?' % globals(), re.U | re.I)
-titre1_strict_re = re.compile(r'(%(annexe_p)s)?%(nature_strict_p)s' % globals(), re.U | re.I)
-titre2_re = re.compile(r' ?(%(autorite_p)s|\(?%(date_p)s\)?|%(numero_p)s|%(ordure_p)s)' % globals(), re.U | re.I)
-titre2_strict_re = re.compile(r'( %(autorite_p)s| \(?%(date_p)s\)?| %(numero_p)s| %(ordure_p)s)' % globals(), re.U | re.I)
+titre1_strict_re = re.compile(r'(%(annexe_p)s)?%(nature_strict_p)s?' % globals(), re.U | re.I)
+titre2_re = re.compile(r' ?\(?(%(autorite_p)s|%(date_p)s|%(numero_p)s|%(ordure_p)s)\)?' % globals(), re.U | re.I)
+titre2_strict_re = re.compile(r'( %(autorite_p)s| \(?%(date_p)s\)?| \(?%(numero_p)s\)?| %(ordure_p)s)' % globals(), re.U | re.I)
 
 
 def gen_titre(annexe, nature, num, date_texte, calendar, autorite):
@@ -57,7 +58,7 @@ def gen_titre(annexe, nature, num, date_texte, calendar, autorite):
         titre = annexe[0].upper() + annexe[1:].lower()
         titre += NATURE_MAP.get(nature, nature).lower()
     else:
-        titre = NATURE_MAP.get(nature, nature.title())
+        titre = NATURE_MAP.get(nature, nature[0].upper() + nature[1:].lower())
     if autorite:
         titre += ' ' + AUTORITE_MAP[autorite]
     if num: