#/Users/cynthia/.pyenv/shims/python

'''
Code to parse the WebAnno event+sentiment annotations conducted in the framework of the NewsDNA project.
Code written by Cynthia Van Hee (LT3) in the framework of the Master's Thesis of Siel Debouver (2020)

Changes made in this copy of the code:
	- Typesystem.xml has changed into TypeSystem.xml
	- Irony feature (polarity triggers) is extracted as well
	- Text spans (indices) of polarity trigger annotations are extracted

Changes with respect to this copy of the code: /home/cynthia/projects/NewsDNA/sentiment_events/sentiment_SENTIRE/IAA/WebAnno_parsing:
	- Code added to extract the sentence in which an event occurs
	- Script arguments are not hard-coded anymore so that processes can run in the background via a shell script
'''


import os
import sys
import argparse
import csv
from zipfile import ZipFile
import shutil
import pickle

from pathlib import Path #Read path

try:
	from cassis import * #Package to parse CAS XMI objects (https://github.com/dkpro/dkpro-cassis)
except ModuleNotFoundError as error:
	print("Warning: activate the virtual env!")
	raise


from Documentobject import Documentobject
from AnnotationsObject import AnnotationsObject, AspectObject, EntityObject, PolarityTriggersObject


#Module-level accumulators shared by the functions below
documentobjectlist = [] #NOTE(review): not referenced by any code visible in this file
warnings = [] #Warning messages collected while parsing/exporting; main() writes them to warnings.txt

def folderprepro(rawAnnotationFilesPath):
	'''
	Reads and unzips raw WebAnno export files (XMI).

	Step 1: every entry of `rawAnnotationFilesPath' whose name ends in .xmi or .txt is copied
	to a folder with that extension replaced by _xmi, after which the original is removed.
	Step 2: every .zip archive found directly inside a *_xmi folder is extracted in place
	and then deleted.

	Returns `rawAnnotationFilesPath' unchanged.
	'''
	print('Reading instances...')
	count = 0
	for folder in os.listdir(rawAnnotationFilesPath):
		if folder.endswith(('.xmi', '.txt')):
			count += 1
			oripath = os.path.join(rawAnnotationFilesPath, folder)
			#Change the extension so that the folder is not mistaken for a file. Only the final
			#extension is stripped: cutting the full path at its first '.' goes wrong as soon as
			#a parent folder or the document name itself contains a dot.
			newfolderpath = os.path.splitext(oripath)[0] + '_xmi'
			shutil.copytree(oripath, newfolderpath)
			shutil.rmtree(oripath)
	print('Succesfully renamed {0} folders'.format(str(count)))
	for annotationsfolder in os.listdir(rawAnnotationFilesPath):
		if annotationsfolder.endswith('_xmi'):
			annpath = os.path.join(rawAnnotationFilesPath, annotationsfolder)
			for zipf in (x for x in os.listdir(annpath) if x.endswith('.zip')):
				unzip(os.path.join(annpath, zipf), annpath)
				os.remove(os.path.join(annpath, zipf)) #Remove file after unzipping
		else:
			#This warning used to hang off a condition that was always true and could never be printed
			print('Warning: folder not recognised during preprocessing:\t{0}'.format(annotationsfolder))
	return rawAnnotationFilesPath


def unzip(zipfile, destination):
	'''
	Extracts every member of the archive at path `zipfile' into the folder `destination'.
	Returns `destination'.
	'''
	#The context manager closes the archive even when extraction raises
	with ZipFile(zipfile, 'r') as zf:
		zf.extractall(destination)
	return destination


def createDocumentObjectDict(xmiFilesFolder, annotatornames, needPrepro='no'):
	'''
	Builds a nested dict from the *_xmi folders found in `xmiFilesFolder':
	key = folder name (used as document name), value = dict mapping each annotator name to a Documentobject.

	Important: the names in `annotatornames' must match the names of the xmi files (<annotator>.xmi).
	CAREFUL: inside every *_xmi folder, all files other than TypeSystem.xml and the annotators'
	xmi files are deleted from disk.
	Pass needPrepro='yes' to run folderprepro() first (only needed once, right after the export).
	An AssertionError is raised when an annotator's xmi file or TypeSystem.xml is missing.
	'''
	if needPrepro.lower() == 'yes':
		folderprepro(xmiFilesFolder)
	xmifolders = [entry for entry in os.listdir(xmiFilesFolder) if entry.endswith('_xmi')]
	print('Reading annotation files...')
	allAnnotationsDict = {}
	for number, foldername in enumerate(xmifolders, start=1):
		print('Processing file ' + str(number) + '...')
		folderpath = os.path.join(xmiFilesFolder, foldername)
		documentname = os.path.basename(folderpath)
		perannotator = {}
		allAnnotationsDict[documentname] = perannotator
		if not os.path.isdir(folderpath):
			warnings.append('Warning while parsing XMI files: unknown folder: {0}'.format(folderpath))
			continue
		#Remove all irrelevant files: keep only TypeSystem.xml and the xmi files of the requested annotators
		for filename in os.listdir(folderpath):
			stem = filename.split('.xmi')[0].lower()
			keep = stem in [name.lower() for name in annotatornames] or filename == 'TypeSystem.xml'
			if not keep:
				os.remove(os.path.join(folderpath, filename))
		#For each annotator, parse the XMI file and store the resulting document object
		typesystempath = os.path.join(folderpath, 'TypeSystem.xml')
		for annotator in annotatornames:
			perannotator[annotator] = {}
			xmipath = os.path.join(folderpath, annotator + '.xmi')
			assert os.path.isfile(xmipath), 'Warning: no xmi file found for annotator {0}'.format(annotator)
			assert os.path.isfile(typesystempath), 'Warning: no typesystem file found for annotator {0}'.format(annotator)
			doctitle, text, casobject = get_metadata_cas(xmipath, typesystempath)
			#NOTE(review): the slot was reset to an empty dict a few lines up, so this duplicate check cannot fire
			if doctitle in perannotator[annotator].keys():
				warnings.append('Warning while parsing XMI files: document {0} occurs twice for annotator {1}'.format(doctitle, annotator))
			else:
				perannotator[annotator] = Documentobject(doctitle, text, casobject)
	return allAnnotationsDict


def get_metadata_cas(casfile, typesystemfile):
	'''
	Extracts document title, document raw text and cas object containing the annotations.

	Returns a tuple (doctitle, text, cas):
		- doctitle: the documentTitle of every DocumentMetaData annotation, joined with ','
		- text: list holding the sofaString of every sofa in the CAS
		- cas: the CAS object loaded from `casfile' with the type system in `typesystemfile'
	'''
	cas, _typesystem = load_cas(casfile, typesystemfile)
	text = [sofa.sofaString for sofa in cas.sofas]
	#Query the metadata once (it used to be selected twice, the first result being discarded)
	docmetadata = cas.select('de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData')
	doctitle = ','.join([el.documentTitle for el in docmetadata])
	return doctitle, text, cas


def load_cas(casfile,typesystemfile):
	'''
	Loads the type system from `typesystemfile' and uses it to read the XMI in `casfile'.
	Both files are opened in binary mode. Returns the tuple (cas, typesystem).
	'''
	with open(typesystemfile, 'rb') as typesystemhandle:
		typesystem = load_typesystem(typesystemhandle)
	with open(casfile, 'rb') as xmihandle:
		cas = load_cas_from_xmi(xmihandle, typesystem=typesystem)
	return (cas, typesystem)


def get_sentences(casobject):
	'''
	Maps the character span of every Sentence annotation in `casobject' to the sentence text.
	Keys are range(begin, end+1) objects, so both the begin and the end offset of a sentence
	are members of its key; values are the covered text.
	An AssertionError is raised when two sentences yield the same range.
	'''
	sentencetype = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
	spans2text = {}
	for sentence in list(casobject.select(sentencetype)):
		span = range(sentence.begin, sentence.end + 1)
		covered = sentence.get_covered_text()
		assert span not in spans2text
		spans2text[span] = covered
	return spans2text


def parse(allAnnotationsDict, annotatornames):
	'''
	Fills the DocumentObjects in allAnnotationsDict (1 per document and per annotator) with the annotations parsed from the cas object.

	For every document/annotator pair a new AnnotationsObject is attached to the document object and filled with:
		- one AspectObject per `webanno.custom.Aspects' annotation
		- one EntityObject per `webanno.custom.NamedEntityRecogniction' annotation
		- one PolarityTriggersObject per `webanno.custom.PolarityTriggers' annotation, including the
		  governor/dependent relations read from `webanno.custom.LinkPolarityTriggers'
	Problems found by the consistency checks are appended to the module-level `warnings' list.
	`annotatornames' is not used by this function; it is kept in the signature for the callers.
	Returns allAnnotationsDict (the document objects are modified in place).
	'''
	for annotatordict in allAnnotationsDict.values():
		for annotatorname, docobject in annotatordict.items():
			docname = docobject.docname
			cas = docobject.casobject
			docobject.annotations = AnnotationsObject()
			annotations = docobject.annotations
			#The sentence lookup for aspects is currently disabled, so the result is not used;
			#the call is kept because it still asserts that all sentence ranges are unique
			get_sentences(cas)
			#Get aspect, entity and polarity trigger annotations for annotatorname
			documentaspectslist = list(cas.select('webanno.custom.Aspects'))
			documententitieslist = list(cas.select('webanno.custom.NamedEntityRecogniction'))
			documentpolaritytriggerslist = list(cas.select('webanno.custom.PolarityTriggers'))
			documentpolaritylinklist = list(cas.select('webanno.custom.LinkPolarityTriggers'))
			#Create a dictionary of polarity trigger governors and dependents (to ensure that
			#dependents are not included as separate polarity triggers, since they are combined with their governors).
			#Both map an xmiID to the list of link annotations in which that trigger takes part.
			polaritytriggergovernorsdict = {}
			polaritytriggerdependentsdict = {}

			#Create a list of aspect objects
			for i, aspect in enumerate(documentaspectslist):
				annotations.aspectlist.append(AspectObject())
				aspectobj = annotations.aspectlist[i]
				aspectobj.aspecttext.append(aspect.get_covered_text())
				aspectobj.category = aspect.FeatureCategory #TODO (original author): collect the aspect subcategory here instead?
				#Joining non-consecutive aspect parts (Sameclause) and looking up the sentence of an aspect
				#are disabled: sameclause is always False and the `sentence'/`sentencerange' attributes
				#are not set here.
				aspectobj.sameclause = False

			#Create a list of entity objects (entity polarity and metonymy are deliberately left out: focus on the NEType)
			for j, entity in enumerate(documententitieslist):
				annotations.entitylist.append(EntityObject())
				entityobj = annotations.entitylist[j]
				entityobj.entitytype = entity.NEType
				entityobj.entitytext.append(entity.get_covered_text())

			#Get polarity trigger dependents and governors
			for polaritytriggerlink in documentpolaritylinklist:
				governor_id = polaritytriggerlink.Governor.xmiID
				dependent_id = polaritytriggerlink.Dependent.xmiID
				polaritytriggergovernorsdict.setdefault(governor_id, []).append(polaritytriggerlink)
				polaritytriggerdependentsdict.setdefault(dependent_id, []).append(polaritytriggerlink)

			#Create object for each polarity trigger. The objects are always addressed by the index of the
			#corresponding cas annotation, exactly as the lists are filled.
			triggers = annotations.polaritytriggerslist
			for l, polaritytrigger in enumerate(documentpolaritytriggerslist):
				triggers.append(PolarityTriggersObject())
				trig = triggers[l]
				trig.id = polaritytrigger.xmiID
				trig.text.append(polaritytrigger.get_covered_text())
				trig.irony.append(polaritytrigger.Irony)
				trig.polarity.append(polaritytrigger.Polarity)
				trig.span.append((polaritytrigger.begin, polaritytrigger.end))
				#Targets (aspects) linked to this trigger and the polarity annotated on each link
				if polaritytrigger.LinkFE:
					for link in polaritytrigger.LinkFE:
						trig.target.append(link.target.get_covered_text())
						trig.aspectrelationpolarity.append(link.role)
				if trig.id in polaritytriggerdependentsdict:
					trig.isdependent = True
					for el in polaritytriggerdependentsdict[trig.id]:
						trig.governors.append(el.Governor.xmiID)
				if trig.id in polaritytriggergovernorsdict:
					for el in polaritytriggergovernorsdict[trig.id]:
						trig.dependents.append(el.Dependent.xmiID)

			#Add all dependent info to the governor object
			for dep_id in polaritytriggerdependentsdict.keys():
				govs = [x.Governor.xmiID for x in polaritytriggerdependentsdict[dep_id]]
				for m, _ in enumerate(documentpolaritytriggerslist):
					dependent = triggers[m]
					if dependent.id != dep_id:
						continue
					#Copy all dependent info to every governor. The lists are stored by reference,
					#so they are shared with the dependent object itself.
					for g in govs:
						for n, _ in enumerate(documentpolaritytriggerslist):
							governor = triggers[n]
							if governor.id == g:
								assert dep_id not in governor.dependentdict
								governor.dependentdict[dep_id] = {'id':[dep_id], 'text':dependent.text, 'polarity':dependent.polarity, 'aspectrelationpolarity':dependent.aspectrelationpolarity, 'target':dependent.target, 'irony':dependent.irony, 'span':dependent.span}

			#If a governor is a dependent of another governor, add its info to the higher governor
			for o in sorted(polaritytriggergovernorsdict.keys(), reverse=True):
				for p, _ in enumerate(documentpolaritytriggerslist):
					lower = triggers[p]
					if lower.id == o and lower.isdependent:
						found = False
						i_d = lower.id
						assert len(lower.governors) == 1
						#Find its governor and add its dependents to the entry it has in that governor
						for q, _ in enumerate(documentpolaritytriggerslist):
							upper = triggers[q]
							if lower.governors[0] == upper.id:
								found = True
								for el in lower.dependents:
									assert el not in upper.dependentdict
									merged = upper.dependentdict[i_d]
									source = lower.dependentdict[el]
									#NOTE(review): 'id' is appended (nested list) while all other fields are extended - confirm this is intended
									merged['id'].append(source['id'])
									merged['text'].extend(source['text'])
									merged['polarity'].extend(source['polarity'])
									merged['aspectrelationpolarity'].extend(source['aspectrelationpolarity'])
									merged['target'].extend(source['target'])
									merged['irony'].extend(source['irony'])
									merged['span'].extend(source['span'])
						assert found

			#Do some checks (e.g. whether polarity triggers have a polarity annotation and are linked to their aspect)
			for r, _ in enumerate(documentpolaritytriggerslist):
				trig = triggers[r]
				if not trig.target:
					aspectlinkfound = any(trig.dependentdict[t]['target'] for t in trig.dependentdict)
					if not aspectlinkfound:
						warnings.append('PT either dependent on governor or not linked to aspect:\t{0}\tIsDependent:{1}\t(Annotator: {2}, Document:{3})'.format(trig.text, trig.isdependent, annotatorname, docname))
				#NOTE(review): one polarity entry (possibly None) is appended to every trigger above, so unless
				#PolarityTriggersObject behaves unexpectedly this condition never holds; an unannotated polarity
				#presumably shows up as [None] - confirm whether `None in polarity' should be tested as well
				if not trig.polarity:
					if trig.aspectrelationpolarity:
						warnings.append('Aspect relation polarity annotated, but span polarity is None for this trigger:\t{0}\t(Annotator: {1}, Document:{2})'.format(trig.text, annotatorname, docname))
					elif trig.dependentdict:
						for t in trig.dependentdict:
							#Look at the entry of dependent t (the lookups used to skip the [t] level, which raises a KeyError)
							dependentinfo = trig.dependentdict[t]
							if not dependentinfo['polarity'] or None in dependentinfo['polarity']:
								if not dependentinfo['aspectrelationpolarity'] or None in dependentinfo['aspectrelationpolarity']:
									warnings.append('No polarity annotated for this trigger:\t{0}\t(Annotator: {1}, Document:{2})'.format(trig.text, annotatorname, docname))
					else:
						warnings.append('No polarity annotated for this trigger:\t{0}\t(Annotator: {1}, Document:{2})'.format(trig.text, annotatorname, docname))
				#NOTE(review): the code used to compare the set of span polarities of the trigger with itself here,
				#a check that could never fire. It is left out until the intended comparison is confirmed.
				if trig.dependentdict:
					#Report the dependent that actually mismatches instead of whichever was iterated last
					mismatch = None
					for t in trig.dependentdict:
						dependentinfo = trig.dependentdict[t]
						if set(dependentinfo['polarity']) != set(dependentinfo['aspectrelationpolarity']):
							mismatch = dependentinfo
					if mismatch is not None:
						warnings.append('Different polarities annotated for this trigger:\t{0}\t(Annotator: {1}\tDocument:{2}\tPT1:{3}\tPT2:{4})'.format(trig.text, annotatorname, docname, list(set(mismatch['polarity'])), list(set(mismatch['aspectrelationpolarity']))))
	return allAnnotationsDict


def write_annotations_aspectcategory(allAnnotationsDict, annotatornames, outfilefolder):
	'''
	Gets, for every unique aspect (allaspectslist), its subcategory given by each annotator.

	Writes two files to `outfilefolder':
		- annotations_aspectcategory.csv (tab-separated): Document, Aspect, one category column per annotator, Sentence.
		  An aspect that occurs more than once with the same text and sentence is written only once per document.
		- aspectTerms.txt: one aspect string per line (duplicates included)
	The aspects of the first annotator in `annotatornames' serve as the reference list for each document.
	'''
	allaspectsdict = get_all_aspects(allAnnotationsDict)
	aspecttermslist = []
	#newline='' is what the csv module expects from the file object it writes to
	with open(os.path.join(outfilefolder, 'annotations_aspectcategory.csv'), 'w', encoding='utf-8', newline='') as csvfile:
		outfilewriter = csv.writer(csvfile, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
		outfilewriter.writerow(['Document', 'Aspect'] + list(annotatornames) + ['Sentence'])
		for documentname in allAnnotationsDict:
			#Get a list of all aspects per document, consider the first annotator, as the aspects are the same for each annotator.
			allaspects = allaspectsdict[documentname][annotatornames[0]]
			seen = [] #Kept as a list: whether the sentence attribute is hashable cannot be told from here
			for refaspect in allaspects:
				aspectText = '+'.join(refaspect.aspecttext) #Aspects can be stored as lists of two non-consecutive spans
				aspectSent = refaspect.sentence
				aspecttermslist.append(aspectText)
				rowelements = [documentname, aspectText] #Rowelements will be written to output file
				for a in annotatornames:
					cat = []
					for aspobj in allaspectsdict[documentname][a]:
						if '+'.join(aspobj.aspecttext) == aspectText and aspobj.sentence == aspectSent:
							cat.append('None' if aspobj.category is None else aspobj.category)
					if not cat:
						#The original not-found warning sat behind a `continue' and could never be reached
						print('Warning: no matching aspect\t{0}\t{1}\tfound for annotator {2}.'.format(aspectText, documentname, a))
						warnings.append('Write annotations for aspect categories: no matching aspect\t{0}\t{1}\tfound for annotator {2}.'.format(aspectText, documentname, a))
					#A category can be 'None' if the aspect is part of a linked aspect span and its label was added to the other part:
					#drop the 'None' entries as soon as a real category is available (this used to reference an undefined name)
					realcategories = [c for c in cat if c != 'None']
					if realcategories:
						cat = realcategories
					rowelements.append(','.join(sorted(set(cat)))) #Sorted so that the output is the same on every run
				rowelements.append(aspectSent)
				if not (aspectText, aspectSent) in seen: #Discard 100% duplicates (i.e. aspects that are annotated twice in the sentence)
					outfilewriter.writerow(rowelements)
				seen.append((aspectText, aspectSent))
	with open(os.path.join(outfilefolder, 'aspectTerms.txt'), 'w', encoding = 'utf-8') as f:
		for el in aspecttermslist:
			f.write(el + '\n')


def get_all_aspects(allAnnotationsDict):
	'''
	Collects the aspect objects of every document, grouped per annotator.
	Returns {documentname: {annotatorname: [aspect objects]}}; the lists are new, the aspect objects are not copied.
	An AssertionError is raised unless every annotator of a document has the same sequence of aspect strings.
	'''
	allaspectsdict = {}
	for documentname, annotatordict in allAnnotationsDict.items():
		perannotator = {}
		allaspectsdict[documentname] = perannotator
		signatures = [] #One string per annotator, used to check that all annotators share the same aspects
		for annotatorname, docobject in annotatordict.items():
			aspects = list(docobject.annotations.aspectlist)
			perannotator[annotatorname] = aspects
			#Aspects can be stored as lists of two non-consecutive spans, hence the inner join
			signatures.append('**'.join('+'.join(asp.aspecttext) for asp in aspects))
		assert len(set(signatures)) == 1, "Warning: unequal number of aspects by annotators."
	return allaspectsdict



def write_annotations_entitycategory(allAnnotationsDict, annotatornames, outfilefolder):
	'''
	Gets commonly annotated entities and their NEType given by each annotator.

	Writes annotations_entitycategory.csv (tab-separated) to `outfilefolder' with the columns
	Document, Entity and one column per annotator. Entities are grouped per document by their text;
	only entities annotated by every annotator in `annotatornames' are written. When an annotator
	gave several types to the same entity text, the distinct types are joined with ',' in sorted order.
	Soft hyphens (\\xad) are removed from the written values.
	'''
	#newline='' is what the csv module expects from the file object it writes to
	with open(os.path.join(outfilefolder, 'annotations_entitycategory.csv'), 'w', encoding="utf-8", newline='') as csvfile:
		outfilewriter = csv.writer(csvfile, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
		outfilewriter.writerow(['Document', 'Entity'] + list(annotatornames))
		for documentname in allAnnotationsDict:
			#Create a dict with all entity annotations for each annotator: {entity text: {annotator: [types]}}
			entitiesdict = {}
			for a in annotatornames:
				docobject = allAnnotationsDict[documentname][a]
				for ent in docobject.annotations.entitylist:
					entity_text = ''.join(ent.entitytext)
					if entity_text not in entitiesdict:
						entitiesdict[entity_text] = {name: [] for name in annotatornames}
					entitiesdict[entity_text][a].append(ent.entitytype)
			#Write annotations to csv file
			for entity, typesperannotator in entitiesdict.items():
				#Check whether every annotator has an annotation for this entity. If so, add it to the csv file
				if all(typesperannotator[a] for a in annotatornames):
					rowelements = [documentname, entity.replace('\xad', '')]
					for a in annotatornames:
						#Sorted so that the order of the types does not change from run to run
						distincttypes = sorted({str(x) for x in typesperannotator[a]})
						rowelements.append(','.join(distincttypes).replace('\xad', ''))
					outfilewriter.writerow(rowelements)


def write_annotations_polarityTriggers(allAnnotationsDict, annotatornames, outfilefolder):
	'''
	Creates a list of unique polarity triggers per aspect and their span, polarity and irony per annotator.

	Writes annotations_polarityTriggers.csv (tab-separated) to `outfilefolder' with the columns
	Document, Aspect, Aspect category, Polarity Trigger and, per annotator, Span/Polarity/Irony ('/' when the
	annotator has no value). Problems are printed and appended to the module-level `warnings' list.

	Dependent triggers are not listed separately: their text is joined to that of their governor.
	Triggers are matched to an aspect purely on the text of their target(s); the check that trigger and
	aspect are in the same sentence is disabled, so identical aspect strings elsewhere in the document match too.
	The aspects of the first annotator in `annotatornames' serve as the reference list for each document.
	'''
	allaspectsdict = get_all_aspects(allAnnotationsDict)

	def new_entry():
		#Empty span/polarity/irony slots for every annotator
		return {n: {'span': [], 'pol': [], 'iro': []} for n in annotatornames}

	#newline='' is what the csv module expects from the file object it writes to
	with open(os.path.join(outfilefolder, 'annotations_polarityTriggers.csv'), 'w', encoding='utf-8', newline='') as csvfile:
		outfilewriter = csv.writer(csvfile, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
		headerdata = []
		for a in annotatornames:
			headerdata.extend(['Span ' + a, 'Polarity '+ a, 'Irony '+ a])
		outfilewriter.writerow(['Document', 'Aspect', 'Aspect category', 'Polarity Trigger'] + headerdata)
		for documentname in allAnnotationsDict:
			#Get a list of all aspects per document, consider the first annotator, as the aspects are the same for each annotator.
			allaspects = allaspectsdict[documentname][annotatornames[0]]
			for aspectObj in allaspects:
				aspectText = '+'.join(aspectObj.aspecttext)
				aspectparts = aspectText.split('+')
				aspectcategory = aspectObj.category
				poltriggernotarget = [] #Keep track of polarity triggers without aspect target
				#The entry for the aspect text itself is created once, for all annotators. It used to be reset
				#for every annotator, which left only the last annotator in it and made the row writing below
				#fail with a KeyError as soon as there were two annotators.
				#NOTE(review): this entry yields a row whose trigger equals the aspect text - confirm it is wanted
				ptdict = {aspectText: new_entry()}

				def linked_to_aspect(target):
					#A target matches the full aspect text or one of its non-consecutive parts
					return aspectText == target or target in aspectparts

				for a in annotatornames:
					docobject = allAnnotationsDict[documentname][a]
					for pt in docobject.annotations.polaritytriggerslist:
						if pt.isdependent: #Exclude dependents, they are included in the governor span
							continue
						if not pt.dependentdict: #PT has no dependent(s)
							text = ','.join(pt.text)
							if not pt.target:
								poltriggernotarget.append((a, text, 'no dependent'))
								continue
							for target in pt.target:
								if not linked_to_aspect(target):
									continue
								if text not in ptdict:
									ptdict[text] = new_entry()
								slot = ptdict[text][a]
								if not pt.polarity or None in pt.polarity:
									slot['pol'].append(pt.aspectrelationpolarity) #If the polarity is not annotated on the PT itself, but on the linking arrow between PT and aspect
								else:
									slot['pol'].append(pt.polarity)
								slot['iro'].append(pt.irony)
								slot['span'].append(pt.span)
						elif not pt.governors: #Join text PT with dependent(s); make sure it is the highest governor
							for value in pt.dependentdict.values():
								final_text = ','.join(pt.text + value['text'])
								#Either governor or dependent(s) can be linked to the target (or both), so collect all targets and remove duplicates
								final_target = sorted(set(pt.target + value['target']), key=str)
								print('FINAL TARG', final_target)
								final_polarity = list(set(pt.polarity + value['polarity'])) #Same as for targets
								final_aspectrelationpolarity = list(set(pt.aspectrelationpolarity + value['aspectrelationpolarity']))
								final_irony = list(set(pt.irony + value['irony']))
								final_span = list(set(pt.span + value['span']))
								if not final_target:
									poltriggernotarget.append((a, ','.join(pt.text), 'dependent'))
									continue
								#Find the polarity that is linked to the aspect under investigation. Every target is
								#tested: only the first element of an arbitrarily ordered list used to be looked at.
								if any(linked_to_aspect(target) for target in final_target):
									if final_text not in ptdict:
										ptdict[final_text] = new_entry()
									slot = ptdict[final_text][a]
									if not pt.polarity or None in pt.polarity:
										slot['pol'].append(final_aspectrelationpolarity)
									else:
										slot['pol'].append(final_polarity)
									slot['iro'].append(final_irony)
									slot['span'].append(final_span)
				#Output some warnings
				print('>>', documentname)
				print(poltriggernotarget)
				for anno, text, dependent in sorted(set(poltriggernotarget)):
					warnings.append('>>> No target annotated for poltrigger:\t{0}\t{1}(Annotator:{2}, Document:{3}'.format(text, dependent, anno, documentname))
					print('No target annotated for poltrigger:', text, dependent, '\tAnnotator: ', anno, '\tDocument:', documentname)
				for poltrigger, anndict in ptdict.items():
					rowelements = [documentname, aspectText.replace('\xad', ''), aspectcategory, poltrigger.replace('\xad', '')]
					for a in annotatornames:
						#Flatten, deduplicate and sort so that the output is the same on every run
						polaritieslist = sorted({x for y in anndict[a]['pol'] for x in y}, key=str)
						ironylist = sorted({x for y in anndict[a]['iro'] for x in y}, key=str)
						spanlist = sorted({x for y in anndict[a]['span'] for x in y})
						rowelements.append(spanlist if spanlist else '/')
						rowelements.append(','.join([str(n) for n in polaritieslist]) if polaritieslist else '/')
						rowelements.append(','.join([str(i) for i in ironylist]) if ironylist else '/')
					outfilewriter.writerow(rowelements)



def main():
	'''
	Command-line entry point: parses the XMI files in a folder and exports the requested annotation type.

	Positional arguments: folder, anns (one or more annotator names), prepro ("yes"/"no"), outf, task.
	All warnings collected during parsing and exporting are written to warnings.txt in the output folder.
	'''
	parser = argparse.ArgumentParser(description='Parse WebAnno XMI files.')
	parser.add_argument('folder', help='Path to folder where XMI files are stored.')
	parser.add_argument('anns', nargs='+', help='The names of the annotators as a space-separate list.')
	# The preprocessing is only needed once, right after WebAnno export. The function changes the foldername (.txt or .xmi to _xmi) and unzips the files inside
	parser.add_argument('prepro', help='Pass "yes" or "no" depending on whether preprocessing of the raw input dir is necessary')
	parser.add_argument('outf', help='Path to folder where output files will be stored.')
	parser.add_argument('task', help='Annotations you wish to parse: aspects, entities or polaritytriggers.')

	args = parser.parse_args()
	print('Script arguments:', args)

	#Make sure the output folder exists before any (time-consuming) parsing is done
	os.makedirs(args.outf, exist_ok=True)

	#`prepro' is a required string, so it is always truthy (also for "no"): test its value instead
	if args.prepro.lower() == 'yes':
		print('> Preprocessing ...')
	allAnnotationsDict = createDocumentObjectDict(args.folder, args.anns, args.prepro)
	allAnnotationsDict = parse(allAnnotationsDict, args.anns)

	print('> Extracting annotations ...')
	assert args.task, "Warning: which annotation do you want to export? Choose between `aspects', `entities' and `polaritytriggers'."
	print('> Script arguments: type of annotation exported: "', args.task, '"')
	task = args.task.lower()
	try:
		if task in ['aspects']:
			write_annotations_aspectcategory(allAnnotationsDict, args.anns, args.outf)
		elif task in ['entities', 'named_entities', 'namedentities']:
			write_annotations_entitycategory(allAnnotationsDict, args.anns, args.outf)
		elif task in ['poltriggers', 'polaritytriggers', 'polarity_triggers']:
			write_annotations_polarityTriggers(allAnnotationsDict, args.anns, args.outf)
		else:
			print("Warning: annotation you wish to export is not recognised! Choose between `aspects', `entities' and `polaritytriggers'.")
	finally:
		#Written last (and also when the export fails) because the export functions add warnings as well;
		#writing the file before the export dropped those
		with open(os.path.join(args.outf,'warnings.txt'), 'w') as warningsfile:
			for w in warnings:
				warningsfile.write(w + '\n')

#Run the command-line interface only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
	main()
