conversion_conll.py
# -*- coding: utf-8 -*-
"""Conversion_CONLL.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1pP9ciu96kMf1cuJ5RY4k_9YwkoiXNNYH
This script converts the URS and XML formats of the original DEMOCRAT corpus
into the more common CoNLL format used for coreference annotation.
"""
import glob, os, re, sys
import pandas as pd
import spacy
import lxml.etree
from bs4 import BeautifulSoup as bs
"""
Description of the final conll file
Column Type Description </br>
1. Document ID
> This is a variation on the document filename</br>
2. Part number
> Some files are divided into multiple parts numbered as 000, 001, 002, ... etc.</br>
3. Word number
4. Word itself
> This is the token as segmented/tokenized in the Treebank. Initially the *_skel file contain the placeholder [WORD] which gets replaced by the actual token from the Treebank which is part of the OntoNotes release.
5. Part-of-Speech
6. Parse bit
> This is the bracketed structure broken before the first open parenthesis in the parse, and the word/part-of-speech leaf replaced with a *. The full parse can be created by substituting the asterix with the "([pos] [word])" string (or leaf) and concatenating the items in the rows of that column. When the parse information is missing, the first word of a sentence is tagged as "(TOP*" and the last word is tagged as "*)" and all intermediate words are tagged with a "*".
7. Predicate lemma
> The predicate lemma is mentioned for the rows for which we have semantic role information. All other rows are marked with a "-"
8. Predicate Frameset ID
> This is the PropBank frameset ID of the predicate in Column 7.
9. Word sense
> This is the word sense of the word in Column 3.
10. Speaker/Author
>This is the speaker or author name where available. Mostly in Broadcast Conversation and Web Log data.
11. Named Entities
>These columns identifies the spans representing various named entities.
12. :N Predicate Arguments
> There is one column each of predicate argument structure information for the predicate mentioned in Column 7.
* N Coreference Coreference
> chain information encoded in a parenthesis structure.
Transformation
"""
class DocumentConllConverter():
'''
Converter object that takes two tree objects (URS and XML) and extracts information for the columns described above
'''
def __init__(self,xml_tree,urs_tree,document_name,convert_pos):
self.xml_tree = xml_tree
self.urs_tree = urs_tree
self.part_number , self.word_number = 0,0
self.speaker = '_'
self.sentence_starts ,self.sentence_ends = True, False
self.sentence_number = 1
self.unclosed_corefs , self.mentions_spans = set(), []
self.convert_pos = convert_pos
self.mention_unique = re.compile(r"#u-MENTION-\d+$")
genres ={'non narratif':"nn", 'mixte (narratif et non narratif)':"mi", 'narratif':"na"}
genre = DF_METADATA[DF_METADATA.fichier==document_name].iat[0,5]
self.document_id = genres.get(genre)+'_'+document_name
self.output_string = ''
self.filter_coreference_spans()
def filter_coreference_spans(self):
'''
Keep a single mention annotation per (from, to) span: the spans are traversed in reverse document order, so the last duplicate wins.
'''
self.unique_mentions = {}
path_mentions = './standOff/annotations[@type="coreference"]/annotationGrp[@subtype="MENTION"]/span'
all_mentions = self.urs_tree.xpath(path_mentions)
for mention in all_mentions[::-1]:
span = (mention.get("from"), mention.get('to'))
if span not in self.unique_mentions:
self.unique_mentions[span] = mention.get('id')
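# Example of the deduplication above (ids and offsets are hypothetical): if two
# MENTION spans both have from="text:w_42" and to="text:w_45", only the id of the
# last one in document order is kept, because the spans are traversed in reverse
# and the first id seen for a given (from, to) pair wins.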
def from_xml_to_conll(self,output_file,print_frequency=10,document_split_frequency=15,seps={".","?","!",";"}):
'''
write the final conll file to the given output_file
Arguments :
print_frequency: progress is printed print_frequency times over the document (every 1/print_frequency of the tokens)
document_split_frequency: number of sentences of the original document that are grouped into one document of the new conll file (see the worked example below).
Necessary to prevent overly long coreference chains that would require too many resources for later computation
seps: sentence separators, i.e. characters at which a sentence can end and a new sentence / new document can start
'''
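# e.g. with document_split_frequency=15, a 40-sentence source document is written
# as three conll documents: part 0 (sentences 1-15), part 1 (16-30), part 2 (31-40)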
self.sentence_seps = seps
self.output_string+=f"#begin document ({self.document_id}); part {self.part_number}\n"
tokens = self.xml_tree.findall('/{*}text/{*}s/{*}w')
token_number = len(tokens)
print_period = max(1, token_number//print_frequency)  # at least 1 to avoid a modulo by zero on very short documents
for i, token in enumerate(tokens):
if (i+1) % (print_period) == 0:
print(round((i+1)/token_number*100,1), "%")
word_id = token.get('id')
if self.sentence_starts:
self.word_number = 0
word = token.find('./{*}form').text.replace("’","'")
if word.strip('-'):
word = word.strip('-')
#print(word)
part_of_speech = token.xpath('./*[contains(@type,"pos")]')[0].text
# pos, fropos, frpos, type
#part_of_speech =token.find('./{*}ana').text
coreference = self.get_coreference(word_id)
parse_bit = self.make_parse_bit(word,part_of_speech)
predicate_lemma,predicate_frameset_id,part_of_speech= self.get_predicate_lemma(token,part_of_speech)
part_of_speech = self.from_frpos_to_upos(part_of_speech)
word_sense = '_'
named_entities = '_'
predicate_arguments = '_'
speaker = '_'
line = (' '*10).join([self.document_id,str(self.part_number),str(self.word_number),
word,part_of_speech,parse_bit,
predicate_lemma,str(predicate_frameset_id),
word_sense,speaker,named_entities,
predicate_arguments,coreference])
self.output_string += line+'\n'
self.word_number +=1
if self.sentence_ends:
self.sentence_starts = True
self.sentence_ends = False
self.mentions_spans = []
if i< len(tokens) -1 and self.sentence_number % document_split_frequency == 0 :
self.part_number+=1
self.output_string +=f"#end document\n\n#begin document ({self.document_id}); part {self.part_number}\n"
self.sentence_number=1
else:
self.output_string +='\n'
self.sentence_number+=1
else:
self.sentence_starts = False
self.output_string+="#end document\n"
with open(output_file,'w',encoding='utf8') as output:
    output.write(self.output_string)
def from_frpos_to_upos(self,pos):
'''
maps french part of speech to their universal counterparts
'''
matchpos = re.match("[A-Z]+",pos)
if pos.startswith("CONcoo"):
upos = 'CCONJ'
elif pos.startswith("CONsub"):
upos = 'SCONJ'
elif matchpos:
upos = matchpos.group(0)
else:
upos = pos
if upos in self.convert_pos:
upos= self.convert_pos[upos]
return upos
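# A few illustrative conversions (CONcoo and CONsub come from the checks above;
# the other tag strings are assumptions about the corpus tag inventory):
#   "CONcoo" -> "CCONJ"
#   "CONsub" -> "SCONJ"
#   "VERinf" -> "VER" -> "VERB"  (leading capitals, then remapped via convert_pos)
#   "NOMcom" -> "NOM" -> "NOUN"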
def get_predicate_lemma(self,token,pos):
predicate_frameset_id = 0
if pos == 'ABR':
doc = NLP(token.find('./{*}form').text)  # let spaCy guess the part of speech of the abbreviated form
pos = doc[0].pos_
if pos in ['VERB','AUX']:
lemma = token.xpath('./*[contains(@type,"lemma")]')[0].text
else:
lemma = '_'
return lemma,predicate_frameset_id,pos
def get_coreference(self,word_id):
'''
Get the coreference chains of the token if there are any, otherwise return '_'.
Returns a string with all the chain ids the token belongs to, following the conll format.
'''
path_begin = './standOff/annotations[@type="coreference"]/annotationGrp[@subtype="MENTION"]/'
path_from_to = f'span[{ends_with("from",word_id)} or {ends_with("to",word_id,)}]'
spans = self.urs_tree.xpath(path_begin+path_from_to)
coreferences = []
#print(spans)
if spans :
for span in spans:
if span.get('id') not in self.unique_mentions.values():
continue
chains = self.urs_tree.xpath(path_begin.replace("MENTION","CHAINE")+f'link[{ends_with("target",span.get("id"))} or contains(@target,"{span.get("id")} ")]')
#print("SPAN",word_id,"from"+span.get("from"),"to"+span.get("to"), chains)
for chain in chains:
if self.mention_unique.match(chain.get("target")):
continue
coreference = ''
chain_id = chain.get('id')
chain_number = re.search(r'\d+',chain_id).group()
if span.get('from')==f"text:{word_id}":
coreference += '('
coreference += chain_number
if span.get('to')==f"text:{word_id}":
coreference +=')'
if re.match(r"\d+\)", coreference) and chain_number in self.unclosed_corefs:
self.unclosed_corefs.remove(chain_number)
elif re.match(r"\(\d+$", coreference):
self.unclosed_corefs.add(chain_number)
coreferences.append(coreference)
if not coreferences:
coreferences = ["_"]
return '|'.join(coreferences)
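# The returned value follows the usual conll coreference column conventions,
# for instance (chain numbers are hypothetical):
#   "(12"     the token opens a mention of chain 12
#   "12)"     the token closes a mention of chain 12 opened earlier
#   "(12)"    the token is a single-token mention of chain 12
#   "(12|(7"  the token opens mentions of two different chains
#   "_"       the token belongs to no mention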
def make_parse_bit(self,token,part_of_speech):
'''
returns the parse bit
updates self.sentence_ends, which signals when a sentence can end and a new one start
'''
if (part_of_speech == 'SENT' or token in self.sentence_seps ) and not self.unclosed_corefs and self.word_number > 3:
self.sentence_ends = True
parse_bit = '*)'
elif self.sentence_starts:
parse_bit = '(TOP*'
else:
parse_bit = '*'
return parse_bit
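# Minimal usage sketch for a single file pair (paths and the document name are
# hypothetical; the globals DF_METADATA, NLP and CONVERT_POS must already be
# defined, as in the __main__ block below):
#   xml_tree = get_tree("DEMOCRAT/democrat/5/data/xml/doc1.xml")
#   urs_tree = get_tree("DEMOCRAT/democrat/5/data/xml-urs/doc1-urs.xml")
#   converter = DocumentConllConverter(xml_tree, urs_tree, "doc1", CONVERT_POS)
#   converter.from_xml_to_conll("doc1.conll")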
def convert_corpus(file_path,output_dir,print_frequency=10,limit_century=16):
'''
browses the DEMOCRAT corpus and from each xml/urs file pair creates a new conll file
'''
file_list = glob.glob(file_path)
file_list = [f for f in file_list if (DF_METADATA[DF_METADATA.fichier==os.path.basename(f).replace(".xml","")].iat[0,4]) >= limit_century]
print(file_list)
file_number = len(file_list)
for i,file_xml in enumerate(file_list):
file_xml_urs = '/xml-urs/'.join(file_xml.split('/xml/')).replace(".xml",'-urs.xml')
xml_tree , urs_tree = get_tree(file_xml), get_tree(file_xml_urs)
dir, base = os.path.split(file_xml)
output_file = output_dir+'/'+base.replace(".xml",'.conll')
document_name = base.replace(".xml","")
print(base, document_name,"file",i+1, "/", file_number)
converter = DocumentConllConverter(xml_tree,urs_tree,document_name,CONVERT_POS)
converter.from_xml_to_conll(output_file,print_frequency=print_frequency,document_split_frequency=15)
print()
def get_tree(file_name):
return lxml.etree.parse(file_name)
def ends_with(attribute,substring):
return f'substring(@{attribute} , string-length(@{attribute}) - string-length("{substring}")+1) = "{substring}"'
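# XPath 1.0 has no ends-with() function, so the predicate is emulated with
# substring(). For example, with a hypothetical word id:
#   ends_with("from", "w_12")
#   -> 'substring(@from , string-length(@from) - string-length("w_12")+1) = "w_12"'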
def convert_century_to_int(century):
'''
applied to the metadata dataframe to convert century strings to int
'''
return int(re.search(r"\d+",century).group())
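# e.g. convert_century_to_int("16e siècle") -> 16 (the exact formatting of the
# century strings in the metadata file is an assumption; only the first run of
# digits is used)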
"""Assemble the multiple conll files into three large files, each for training, development and testing"""
def open_w(f):
return open(f,"w",encoding="utf8")
def splits_conll_file(f,train_corpus_size,test_corpus_size):
'''
splits the content of the conll file into three strings of the given proportions: train, dev and test
'''
dev_corpus_size = 1 - ( train_corpus_size+test_corpus_size)
train = dev = test = ''
with open(f) as input_file:
text = input_file.read()
docs_number = text.count("#begin document")
train_end = docs_number*train_corpus_size
dev_end = train_end + docs_number*dev_corpus_size
part_number=0
for line in text.splitlines(True):
document_starts = re.match(r"#begin document.*?(\d+)\n",line)
if document_starts:
part_number = int(document_starts.group(1))
if part_number < train_end:
train += line
elif part_number < dev_end:
dev += line
else:
test+= line
return train, dev, test
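# Worked example: with 20 "#begin document" parts in a file, train_corpus_size=0.6
# and test_corpus_size=0.1 (hence dev_corpus_size=0.3), train_end=12 and dev_end=18,
# so parts 0-11 go to train, parts 12-17 to dev and parts 18-19 to test.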
def make_large_conll_files(conll_files_path,conll_large_files_directory,train_corpus_size=0.6,test_corpus_size=0.1):
'''
splits each conll file in the directory into three parts
and combines them into three large conll files
'''
assert all([0<s<1 for s in (train_corpus_size,test_corpus_size) ])
if train_corpus_size+test_corpus_size > 0.9:
print("development corpus too small: it must be at least 10 % of the whole corpus")
for dir in ["train","dev","test"]:
os.makedirs(conll_large_files_directory+"/"+dir, exist_ok=True)
with open_w(conll_large_files_directory+"/train/train.french.v4_gold_conll") as train_output,open_w(conll_large_files_directory+"/dev/dev.french.v4_gold_conll") as dev_output, open_w(conll_large_files_directory+"/test/test.french.v4_gold_conll") as test_output:
for f in glob.glob(conll_files_path):
train,dev,test = splits_conll_file(f,train_corpus_size,test_corpus_size)
train_output.write(train)
dev_output.write(dev)
test_output.write(test)
if __name__ == '__main__':
NLP = spacy.load('fr_core_news_sm')
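# the corpus root directory is expected as the first command-line argument;
# it must contain the DEMOCRAT/ folder referenced in the paths below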
path_corpus = sys.argv[1]
CONVERT_POS = {"VER":"VERB","PRO":"PRON","PUN":"PUNCT",'PON':'PUNCT',
"NOM":'NOUN','NAM':'PROPN','SENT':'PUNCT','PRE':'ADP',
'PRP':'ADP','INJ':'INTJ','KON':'CCONJ','INT':'INTJ',
'ETRE':'NUM','OUT':'X',}
DF_METADATA = pd.read_csv(path_corpus+"DEMOCRAT/democrat_metadata.csv",sep="\t")
DF_METADATA["siècle_composition"] = DF_METADATA["siècle_composition"].apply(convert_century_to_int)
DF_METADATA.fichier = DF_METADATA.fichier.apply(lambda x:x.lower())
convert_corpus(path_corpus+'/DEMOCRAT/democrat/5/data/xml/*.xml',path_corpus+'/DEMOCRAT/democrat/5/data/conll2',print_frequency=10,limit_century=19)
make_large_conll_files(path_corpus+'/DEMOCRAT/democrat/5/data/conll2/*.conll',path_corpus+'/DEMOCRAT/democrat/5/data/conll2/conll_large',train_corpus_size=0.65,test_corpus_size=0.1)