Spaces:
Running
Running
Major update. Support for 15 LLMs, World Flora Online taxonomy validation, geolocation, 2 OCR methods, significant UI changes, stability improvements, consistent JSON parsing
e91ac58
| import os | |
| from opencage.geocoder import OpenCageGeocode | |
| import pycountry_convert as pc | |
| import warnings | |
| import unicodedata | |
| import pycountry_convert as pc | |
| import warnings | |
# TODO (2024-01-24): if OpenCage is to be used instead of HERE, update the
# procedure for picking the best / most granular geolocation result.
def normalize_country_name(name):
    """Return *name* reduced to plain ASCII.

    The string is decomposed with Unicode NFKD so accented letters split into
    a base letter plus combining marks; encoding to ASCII with ``'ignore'``
    then drops every non-ASCII code point (the marks, and any character that
    has no ASCII decomposition).
    """
    decomposed = unicodedata.normalize('NFKD', name)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')
def get_continent(country_name):
    """Return the continent name for *country_name*, or ``''`` if unresolved.

    The name is ASCII-normalized with ``normalize_country_name``, converted to
    an alpha-2 country code and then to a continent code via
    ``pycountry_convert``, and finally mapped to a human-readable continent
    name.

    Args:
        country_name: Country name to look up.

    Returns:
        The continent name, or ``''`` when the input is empty / not a string,
        the lookup raises, or the continent code is not in the local mapping.
    """
    # Nothing to look up: return quietly instead of running a conversion that
    # can only fail and print an error for every blank record.
    if not isinstance(country_name, str) or not country_name.strip():
        return ''

    warnings.filterwarnings("ignore", category=UserWarning, module='pycountry')

    continent_code_to_name = {
        "AF": "Africa",
        "NA": "North America",
        "OC": "Oceania",
        "AN": "Antarctica",
        "AS": "Asia",
        "EU": "Europe",
        "SA": "South America",
    }

    try:
        normalized_country_name = normalize_country_name(country_name)
        # Country name -> alpha-2 country code
        country_code = pc.country_name_to_country_alpha2(normalized_country_name)
        # Alpha-2 country code -> continent code
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        # Continent code -> continent name
        return continent_code_to_name.get(continent_code, '')
    except Exception as e:
        # Best-effort lookup: report the problem and fall back to "unknown".
        print(str(e))
        return ''
def _clean_record_field(record, key):
    """Return ``record[key]`` as a stripped string; missing or None becomes ''."""
    value = record.get(key)
    return '' if value is None else str(value).strip()


def validate_coordinates_opencage(record, replace_if_success_geo=False):
    """Forward-geocode a record's locality fields with OpenCage.

    A query string is built from the record's ``municipality``, ``county``,
    ``stateProvince`` and ``country`` fields (empty ones are skipped) and sent
    to the OpenCage geocoder. Only the first result is used.

    Args:
        record: Dict-like record holding the locality fields. It is modified
            in place when ``replace_if_success_geo`` is true and geocoding
            succeeds.
        replace_if_success_geo: When true and a formatted result was returned,
            ``GEO_override_OCR`` is set to ``True`` in the returned GEO dict
            and the record's ``country``, ``stateProvince``, ``county`` and
            ``municipality`` are replaced by the geocoded values. A record
            field is left untouched when the geocoder gave no value for it.

    Returns:
        ``(record, GEO_dict)``. Every ``GEO_*`` value stays ``''`` when there
        was nothing to geocode or no result came back.

    Raises:
        KeyError: If the ``open_cage_geocode`` environment variable (the API
            key) is not set and a non-empty query has to be sent.
    """
    GEO_dict = {
        'GEO_method': '',
        'GEO_formatted_full_string': '',
        'GEO_decimal_lat': '',
        'GEO_decimal_long': '',
        'GEO_city': '',
        'GEO_county': '',
        'GEO_state': '',
        'GEO_state_code': '',
        'GEO_country': '',
        'GEO_country_code': '',
        'GEO_continent': '',
    }

    locality_keys = ('municipality', 'county', 'stateProvince', 'country')
    query_loc = ', '.join(
        filter(None, (_clean_record_field(record, key) for key in locality_keys))
    )
    # NOTE: decimalLatitude / decimalLongitude / verbatimCoordinates are not
    # used yet; only forward geocoding of the locality string is implemented.

    # Nothing to geocode: do not read the API key or spend a request.
    if not query_loc:
        return record, GEO_dict

    geocoder = OpenCageGeocode(os.environ['open_cage_geocode'])
    results = geocoder.geocode(query_loc, no_annotations='1')
    if not results:
        return record, GEO_dict

    best = results[0]
    geometry = best.get('geometry') or {}
    components = best.get('components') or {}

    GEO_dict['GEO_method'] = 'OpenCageGeocode_forward'
    GEO_dict['GEO_formatted_full_string'] = best.get('formatted', '')
    GEO_dict['GEO_decimal_lat'] = geometry.get('lat', '')
    GEO_dict['GEO_decimal_long'] = geometry.get('lng', '')

    # Use .get so a result lacking a component (for example no 'city' or
    # 'county') yields '' instead of raising KeyError.
    component_fields = (
        ('GEO_city', 'city'),
        ('GEO_county', 'county'),
        ('GEO_state', 'state'),
        ('GEO_state_code', 'state_code'),
        ('GEO_country', 'country'),
        ('GEO_country_code', 'country_code'),
        ('GEO_continent', 'continent'),
    )
    for geo_key, component_key in component_fields:
        GEO_dict[geo_key] = components.get(component_key, '')

    if GEO_dict['GEO_formatted_full_string'] and replace_if_success_geo:
        GEO_dict['GEO_override_OCR'] = True
        record_fields = (
            ('country', 'GEO_country'),
            ('stateProvince', 'GEO_state'),
            ('county', 'GEO_county'),
            ('municipality', 'GEO_city'),
        )
        for record_key, geo_key in record_fields:
            # Keep the existing value when the geocoder had nothing for it.
            if GEO_dict[geo_key]:
                record[record_key] = GEO_dict[geo_key]

    return record, GEO_dict