import argparse
import glob
import os
import random
import xml.etree.ElementTree as ET

import jsonlines
from tqdm import tqdm


def get_sentence_data(fn):
    """
    Parses a sentence file from the Flickr30K Entities dataset.

    input:
        fn - full file path to the sentence file to parse

    output:
        a list of dictionaries, one per sentence, with the following fields:
            sentence - the original sentence
            phrases - a list of dictionaries, one per annotated phrase, with
                the following fields:
                    phrase - the text of the annotated phrase
                    first_word_index - the position of the first word of
                        the phrase in the sentence
                    phrase_id - an identifier for this phrase
                    phrase_type - a list of the coarse categories this
                        phrase belongs to

    An illustrative sketch of the raw format appears after this function.
    """
    with open(fn, 'r') as f:
        sentences = f.read().split('\n')

    annotations = []
    for sentence in sentences:
        if not sentence:
            continue

        first_word = []
        phrases = []
        phrase_id = []
        phrase_type = []
        words = []
        current_phrase = []
        add_to_phrase = False
        for token in sentence.split():
            if add_to_phrase:
                # A token ending in ']' closes the current bracketed phrase.
                if token[-1] == ']':
                    add_to_phrase = False
                    token = token[:-1]
                    current_phrase.append(token)
                    phrases.append(' '.join(current_phrase))
                    current_phrase = []
                else:
                    current_phrase.append(token)
                words.append(token)
            else:
                # A token starting with '[' is a phrase header of the form
                # "[/EN#<id>/<type>...": record where the phrase starts, its
                # id (with the "EN#" prefix stripped), and its type list.
                if token[0] == '[':
                    add_to_phrase = True
                    first_word.append(len(words))
                    parts = token.split('/')
                    phrase_id.append(parts[1][3:])
                    phrase_type.append(parts[2:])
                else:
                    words.append(token)

        sentence_data = {'sentence': ' '.join(words), 'phrases': []}
        for index, phrase, p_id, p_type in zip(first_word, phrases, phrase_id, phrase_type):
            sentence_data['phrases'].append({'first_word_index': index,
                                             'phrase': phrase,
                                             'phrase_id': p_id,
                                             'phrase_type': p_type})
        annotations.append(sentence_data)

    return annotations
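
# Illustrative sketch of the bracketed-phrase format parsed above (the
# sentence and ids are invented for illustration, not taken from the dataset):
#
#   [/EN#283585/people A woman] stands near [/EN#283590/scene the water] .
#
# parses into:
#
#   {'sentence': 'A woman stands near the water .',
#    'phrases': [{'first_word_index': 0, 'phrase': 'A woman',
#                 'phrase_id': '283585', 'phrase_type': ['people']},
#                {'first_word_index': 4, 'phrase': 'the water',
#                 'phrase_id': '283590', 'phrase_type': ['scene']}]}
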
def get_annotations(fn):
    """
    Parses an annotation XML file from the Flickr30K Entities dataset.

    input:
        fn - full file path to the annotation file to parse

    output:
        a dictionary with the following fields:
            filename - the image filename; the image width/height/depth
                from the <size> element are copied in alongside it
            scene - list of identifiers which were annotated as
                pertaining to the whole scene
            nobox - list of identifiers which were annotated as
                not being visible in the image
            boxes - a dictionary whose keys are identifiers and whose
                values are lists of boxes in [xmin, ymin, xmax, ymax]
                format

    An abridged sketch of the XML layout appears after this function.
    """
    tree = ET.parse(fn)
    root = tree.getroot()
    filename = root.findall('filename')[0].text
    size_container = root.findall('size')[0]
    anno_info = {'filename': filename, 'boxes': {}, 'scene': [], 'nobox': []}
    # Copy the image dimensions (width, height, depth) into the record.
    for size_element in size_container:
        anno_info[size_element.tag] = int(size_element.text)

    for object_container in root.findall('object'):
        for names in object_container.findall('name'):
            box_id = names.text
            box_container = object_container.findall('bndbox')
            if len(box_container) > 0:
                if box_id not in anno_info['boxes']:
                    anno_info['boxes'][box_id] = []
                # The dataset stores 1-based pixel coordinates; shift to 0-based.
                xmin = int(box_container[0].findall('xmin')[0].text) - 1
                ymin = int(box_container[0].findall('ymin')[0].text) - 1
                xmax = int(box_container[0].findall('xmax')[0].text) - 1
                ymax = int(box_container[0].findall('ymax')[0].text) - 1
                anno_info['boxes'][box_id].append([xmin, ymin, xmax, ymax])
            else:
                # Entities without a box are either not visible in the image
                # or refer to the scene as a whole.
                nobndbox = int(object_container.findall('nobndbox')[0].text)
                if nobndbox > 0:
                    anno_info['nobox'].append(box_id)

                scene = int(object_container.findall('scene')[0].text)
                if scene > 0:
                    anno_info['scene'].append(box_id)

    return anno_info
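
# Abridged sketch of one Annotations/*.xml file (tag layout as read above;
# the numbers are invented for illustration):
#
#   <annotation>
#     <filename>1000092795.jpg</filename>
#     <size><width>500</width><height>333</height><depth>3</depth></size>
#     <object>
#       <name>283585</name>
#       <bndbox><xmin>1</xmin><ymin>48</ymin><xmax>206</xmax><ymax>329</ymax></bndbox>
#     </object>
#   </annotation>
#
# For this file, get_annotations returns boxes == {'283585': [[0, 47, 205, 328]]}
# along with filename, width, height, and depth.
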
def gen_record(sd, an):
    """
    Builds one ODVG grounding record from a parsed sentence (sd) and its
    image annotation (an). Returns None when no phrase in the sentence has
    a bounding box, since such a record carries no grounding signal.
    """
    filename = an["filename"]
    caption = sd["sentence"]
    regions = []
    for ph in sd["phrases"]:
        if ph["phrase_id"] in an["boxes"]:
            for box in an["boxes"][ph["phrase_id"]]:
                regions.append(
                    {
                        "phrase": ph["phrase"],
                        "bbox": box
                    }
                )
    if len(regions) < 1:
        print("no phrase regions")
        return None
    return {
        "filename": filename,
        "height": an["height"],
        "width": an["width"],
        "grounding": {
            "caption": caption,
            "regions": regions
        }
    }
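
# Illustrative example of an emitted record (one JSON object per line of the
# output .jsonl; values continue the sketches above):
#
#   {"filename": "1000092795.jpg", "height": 333, "width": 500,
#    "grounding": {"caption": "A woman stands near the water .",
#                  "regions": [{"phrase": "A woman",
#                               "bbox": [0, 47, 205, 328]}]}}
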
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Flickr30K Entities to ODVG list.")
    parser.add_argument("--root", type=str, default="", help="source annotation root")
    parser.add_argument("--output_file", type=str, default="flickr30k_entities_odvg.jsonl")
    parser.add_argument("--osoi", action="store_true", default=False,
                        help="one sentence per image: keep a single random caption")
    args = parser.parse_args()
    print(args)

    odvg_anno = []
    # Sentences/ and Annotations/ hold one file per image with matching names,
    # so sorting the two globs pairs them up by index.
    sentence_list = sorted(glob.glob(os.path.join(args.root, "Sentences", "*")))
    annotation_list = sorted(glob.glob(os.path.join(args.root, "Annotations", "*")))
    assert len(sentence_list) == len(annotation_list), "Sentences/Annotations mismatch"

    for idx in tqdm(range(len(annotation_list))):
        sds = get_sentence_data(sentence_list[idx])
        an = get_annotations(annotation_list[idx])
        if args.osoi:
            sd = random.choice(sds)
            x = gen_record(sd, an)
            if x:
                odvg_anno.append(x)
        else:
            for sd in sds:
                x = gen_record(sd, an)
                if x:
                    odvg_anno.append(x)

    with jsonlines.open(args.output_file, mode="w") as fwriter:
        fwriter.write_all(odvg_anno)
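
# Example invocation (a sketch; the script filename and paths are placeholders):
#
#   python flickr30k_entities2odvg.py \
#       --root /path/to/flickr30k_entities \
#       --output_file flickr30k_entities_odvg.jsonl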