From a766a07ea1502b1aabe9daad263c9b095b1239e3 Mon Sep 17 00:00:00 2001 From: Sean Morley Date: Mon, 13 Jan 2025 14:12:05 -0500 Subject: [PATCH] fix: add warning for interrupted download-countries command due to memory issues --- backend/entrypoint.sh | 5 + .../management/commands/download-countries.py | 263 +++++++++++------- 2 files changed, 173 insertions(+), 95 deletions(-) diff --git a/backend/entrypoint.sh b/backend/entrypoint.sh index 26c3e0d..50e034a 100644 --- a/backend/entrypoint.sh +++ b/backend/entrypoint.sh @@ -52,8 +52,13 @@ EOF fi +# Sync the countries and world travel regions # Sync the countries and world travel regions python manage.py download-countries +if [ $? -eq 137 ]; then + >&2 echo "WARNING: The download-countries command was interrupted. This is likely due to lack of memory allocated to the container or the host. Please try again with more memory." + exit 1 +fi cat /code/adventurelog.txt diff --git a/backend/server/worldtravel/management/commands/download-countries.py b/backend/server/worldtravel/management/commands/download-countries.py index 4556697..77dff60 100644 --- a/backend/server/worldtravel/management/commands/download-countries.py +++ b/backend/server/worldtravel/management/commands/download-countries.py @@ -1,11 +1,9 @@ import os -import requests -import uuid from django.core.management.base import BaseCommand +import requests from worldtravel.models import Country, Region, City from django.db import transaction import ijson -from django.conf import settings import psutil def get_memory_usage(): @@ -17,15 +15,22 @@ def log_memory_usage(stage): memory_usage = get_memory_usage() print(f"Memory usage at {stage}: {memory_usage / 1024 / 1024:.2f} MB") +from django.conf import settings + COUNTRY_REGION_JSON_VERSION = settings.COUNTRY_REGION_JSON_VERSION + media_root = settings.MEDIA_ROOT def saveCountryFlag(country_code): + # For standards, use the lowercase country_code country_code = country_code.lower() flags_dir = os.path.join(media_root, 'flags') + + # Check if the flags directory exists, if not, create it if not os.path.exists(flags_dir): os.makedirs(flags_dir) + # Check if the flag already exists in the media folder flag_path = os.path.join(flags_dir, f'{country_code}.png') if os.path.exists(flag_path): print(f'Flag for {country_code} already exists') @@ -46,8 +51,8 @@ class Command(BaseCommand): parser.add_argument('--force', action='store_true', help='Force download the countries+regions+states.json file') def handle(self, **options): - log_memory_usage("start") force = options['force'] + batch_size = 250 countries_json_path = os.path.join(settings.MEDIA_ROOT, f'countries+regions+states-{COUNTRY_REGION_JSON_VERSION}.json') if not os.path.exists(countries_json_path) or force: res = requests.get(f'https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/{COUNTRY_REGION_JSON_VERSION}/json/countries%2Bstates%2Bcities.json') @@ -68,17 +73,31 @@ class Command(BaseCommand): else: self.stdout.write(self.style.SUCCESS('Latest country, region, and state data already downloaded.')) return - - insert_id = uuid.uuid4() with transaction.atomic(): + # Process data in chunks using ijson f = open(countries_json_path, 'rb') parser = ijson.items(f, 'item') + existing_countries = {} + for country in Country.objects.iterator(): + existing_countries[country.country_code] = country + existing_regions = {} + for region in Region.objects.iterator(): + existing_regions[region.id] = region + existing_cities = {} + for city in City.objects.iterator(): + existing_cities[city.id] = city - region_batch = [] - city_batch = [] - existing_region_ids = set() - existing_city_ids = set() + countries_to_create = [] + regions_to_create = [] + countries_to_update = [] + regions_to_update = [] + cities_to_create = [] + cities_to_update = [] + + processed_country_codes = set() + processed_region_ids = set() + processed_city_ids = set() for country in parser: country_code = country['iso2'] @@ -88,102 +107,156 @@ class Command(BaseCommand): longitude = round(float(country['longitude']), 6) if country['longitude'] else None latitude = round(float(country['latitude']), 6) if country['latitude'] else None - country_obj, _ = Country.objects.update_or_create( - country_code=country_code, - defaults={ - 'name': country_name, - 'subregion': country_subregion, - 'capital': country_capital, - 'longitude': longitude, - 'latitude': latitude, - 'insert_id': insert_id - } - ) + processed_country_codes.add(country_code) + + if country_code in existing_countries: + country_obj = existing_countries[country_code] + country_obj.name = country_name + country_obj.subregion = country_subregion + country_obj.capital = country_capital + country_obj.longitude = longitude + country_obj.latitude = latitude + countries_to_update.append(country_obj) + else: + country_obj = Country( + name=country_name, + country_code=country_code, + subregion=country_subregion, + capital=country_capital, + longitude=longitude, + latitude=latitude + ) + countries_to_create.append(country_obj) saveCountryFlag(country_code) - log_memory_usage(country_code) + # self.stdout.write(self.style.SUCCESS(f'Country {country_name} prepared')) - if country['states']: - for state in country['states']: - state_id = f"{country_code}-{state['state_code']}" if state['state_code'] else f"{country_code}-00" - - # Ensure no duplicate regions - if state_id not in existing_region_ids: + if country['states']: + for state in country['states']: + name = state['name'] + state_id = f"{country_code}-{state['state_code']}" + latitude = round(float(state['latitude']), 6) if state['latitude'] else None + longitude = round(float(state['longitude']), 6) if state['longitude'] else None + + # Check for duplicate regions + if state_id in processed_region_ids: + self.stdout.write(self.style.ERROR(f'State {state_id} already processed')) + continue + log_memory_usage(f"State {state_id} processing") + + processed_region_ids.add(state_id) + + if state_id in existing_regions: + region_obj = existing_regions[state_id] + region_obj.name = name + region_obj.country = country_obj + region_obj.longitude = longitude + region_obj.latitude = latitude + regions_to_update.append(region_obj) + else: + region_obj = Region( + id=state_id, + name=name, + country=country_obj, + longitude=longitude, + latitude=latitude + ) + regions_to_create.append(region_obj) + # self.stdout.write(self.style.SUCCESS(f'State {state_id} prepared')) + + if 'cities' in state and len(state['cities']) > 0: + for city in state['cities']: + city_id = f"{state_id}-{city['id']}" + city_name = city['name'] + latitude = round(float(city['latitude']), 6) if city['latitude'] else None + longitude = round(float(city['longitude']), 6) if city['longitude'] else None + + # Check for duplicate cities + if city_id in processed_city_ids: + self.stdout.write(self.style.ERROR(f'City {city_id} already processed')) + continue + + processed_city_ids.add(city_id) + + if city_id in existing_cities: + city_obj = existing_cities[city_id] + city_obj.name = city_name + city_obj.region = region_obj + city_obj.longitude = longitude + city_obj.latitude = latitude + cities_to_update.append(city_obj) + else: + city_obj = City( + id=city_id, + name=city_name, + region=region_obj, + longitude=longitude, + latitude=latitude + ) + cities_to_create.append(city_obj) + # self.stdout.write(self.style.SUCCESS(f'City {city_id} prepared')) + + else: + state_id = f"{country_code}-00" + processed_region_ids.add(state_id) + if state_id in existing_regions: + region_obj = existing_regions[state_id] + region_obj.name = country_name + region_obj.country = country_obj + regions_to_update.append(region_obj) + else: region_obj = Region( id=state_id, - name=state['name'], - country=country_obj, - longitude=state['longitude'], - latitude=state['latitude'], - insert_id=insert_id + name=country_name, + country=country_obj ) - region_batch.append(region_obj) - existing_region_ids.add(state_id) - log_memory_usage(state_id) + regions_to_create.append(region_obj) + # self.stdout.write(self.style.SUCCESS(f'Region {state_id} prepared for {country_name}')) + # Process in batches + for i in range(0, len(countries_to_create), batch_size): + batch = countries_to_create[i:i + batch_size] + Country.objects.bulk_create(batch) + self.stdout.write(self.style.SUCCESS(f'Processed countries batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}')) + log_memory_usage(f"Country batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}") - # Handle cities and avoid duplicates - if 'cities' in state and len(state['cities']) > 0: - for city in state['cities']: - city_id = f"{state_id}-{city['id']}" - - if city_id not in existing_city_ids: - city_obj = City( - id=city_id, - name=city['name'], - region=region_obj, - longitude=city['longitude'], - latitude=city['latitude'], - insert_id=insert_id - ) - city_batch.append(city_obj) - existing_city_ids.add(city_id) + for i in range(0, len(regions_to_create), batch_size): + batch = regions_to_create[i:i + batch_size] + Region.objects.bulk_create(batch) + self.stdout.write(self.style.SUCCESS(f'Processed regions batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}')) + log_memory_usage(f"Region batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}") - # Bulk insert regions in smaller batches - if len(region_batch) >= 100: - Region.objects.bulk_create( - region_batch, - update_conflicts=True, - batch_size=100, - update_fields=['name', 'country', 'longitude', 'latitude', 'insert_id'], - unique_fields=['id'] - ) - region_batch.clear() + for i in range(0, len(cities_to_create), batch_size): + batch = cities_to_create[i:i + batch_size] + City.objects.bulk_create(batch) + self.stdout.write(self.style.SUCCESS(f'Processed cities batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}')) + log_memory_usage(f"City batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}") - # Bulk insert cities in smaller batches - if len(city_batch) >= 100: - City.objects.bulk_create( - city_batch, - update_conflicts=True, - batch_size=100, - update_fields=['name', 'region', 'longitude', 'latitude', 'insert_id'], - unique_fields=['id'] - ) - city_batch.clear() + # Process updates in batches + for i in range(0, len(countries_to_update), batch_size): + batch = countries_to_update[i:i + batch_size] + Country.objects.bulk_update(batch, ['name', 'subregion', 'capital', 'longitude', 'latitude']) + self.stdout.write(self.style.SUCCESS(f'Updated countries batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}')) + log_memory_usage(f"Country update batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}") - # Final insertion of any remaining regions and cities - if region_batch: - Region.objects.bulk_create( - region_batch, - update_conflicts=True, - batch_size=100, - update_fields=['name', 'country', 'longitude', 'latitude', 'insert_id'], - unique_fields=['id'] - ) + for i in range(0, len(regions_to_update), batch_size): + batch = regions_to_update[i:i + batch_size] + Region.objects.bulk_update(batch, ['name', 'country', 'longitude', 'latitude']) + self.stdout.write(self.style.SUCCESS(f'Updated regions batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}')) + log_memory_usage(f"Region update batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}") - if city_batch: - City.objects.bulk_create( - city_batch, - update_conflicts=True, - batch_size=100, - update_fields=['name', 'region', 'longitude', 'latitude', 'insert_id'], - unique_fields=['id'] - ) + for i in range(0, len(cities_to_update), batch_size): + batch = cities_to_update[i:i + batch_size] + City.objects.bulk_update(batch, ['name', 'region', 'longitude', 'latitude']) + self.stdout.write(self.style.SUCCESS(f'Updated cities batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}')) + log_memory_usage(f"City update batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}") - self.stdout.write(self.style.SUCCESS('Regions and cities created')) + # Delete countries and regions that are no longer in the data + Country.objects.exclude(country_code__in=processed_country_codes).delete() + log_memory_usage("Extra Countries deleted") + Region.objects.exclude(id__in=processed_region_ids).delete() + log_memory_usage("Extra Regions deleted") + City.objects.exclude(id__in=processed_city_ids).delete() + log_memory_usage("Extra Cities deleted") - # Clean up old data - Country.objects.exclude(insert_id=insert_id).delete() - Region.objects.exclude(insert_id=insert_id).delete() - City.objects.exclude(insert_id=insert_id).delete() - self.stdout.write(self.style.SUCCESS('All data imported successfully and old data cleaned up')) \ No newline at end of file + self.stdout.write(self.style.SUCCESS('All data imported successfully')) \ No newline at end of file