1
0
Fork 0
mirror of https://github.com/seanmorley15/AdventureLog.git synced 2025-07-21 05:49:37 +02:00

fix: add warning for interrupted download-countries command due to memory issues

This commit is contained in:
Sean Morley 2025-01-13 14:12:05 -05:00
parent f27453824a
commit a766a07ea1
2 changed files with 173 additions and 95 deletions

View file

@ -52,8 +52,13 @@ EOF
fi fi
# Sync the countries and world travel regions
# Sync the countries and world travel regions # Sync the countries and world travel regions
python manage.py download-countries python manage.py download-countries
if [ $? -eq 137 ]; then
>&2 echo "WARNING: The download-countries command was interrupted. This is likely due to lack of memory allocated to the container or the host. Please try again with more memory."
exit 1
fi
cat /code/adventurelog.txt cat /code/adventurelog.txt

View file

@ -1,11 +1,9 @@
import os import os
import requests
import uuid
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
import requests
from worldtravel.models import Country, Region, City from worldtravel.models import Country, Region, City
from django.db import transaction from django.db import transaction
import ijson import ijson
from django.conf import settings
import psutil import psutil
def get_memory_usage(): def get_memory_usage():
@ -17,15 +15,22 @@ def log_memory_usage(stage):
memory_usage = get_memory_usage() memory_usage = get_memory_usage()
print(f"Memory usage at {stage}: {memory_usage / 1024 / 1024:.2f} MB") print(f"Memory usage at {stage}: {memory_usage / 1024 / 1024:.2f} MB")
from django.conf import settings
COUNTRY_REGION_JSON_VERSION = settings.COUNTRY_REGION_JSON_VERSION COUNTRY_REGION_JSON_VERSION = settings.COUNTRY_REGION_JSON_VERSION
media_root = settings.MEDIA_ROOT media_root = settings.MEDIA_ROOT
def saveCountryFlag(country_code): def saveCountryFlag(country_code):
# For standards, use the lowercase country_code
country_code = country_code.lower() country_code = country_code.lower()
flags_dir = os.path.join(media_root, 'flags') flags_dir = os.path.join(media_root, 'flags')
# Check if the flags directory exists, if not, create it
if not os.path.exists(flags_dir): if not os.path.exists(flags_dir):
os.makedirs(flags_dir) os.makedirs(flags_dir)
# Check if the flag already exists in the media folder
flag_path = os.path.join(flags_dir, f'{country_code}.png') flag_path = os.path.join(flags_dir, f'{country_code}.png')
if os.path.exists(flag_path): if os.path.exists(flag_path):
print(f'Flag for {country_code} already exists') print(f'Flag for {country_code} already exists')
@ -46,8 +51,8 @@ class Command(BaseCommand):
parser.add_argument('--force', action='store_true', help='Force download the countries+regions+states.json file') parser.add_argument('--force', action='store_true', help='Force download the countries+regions+states.json file')
def handle(self, **options): def handle(self, **options):
log_memory_usage("start")
force = options['force'] force = options['force']
batch_size = 250
countries_json_path = os.path.join(settings.MEDIA_ROOT, f'countries+regions+states-{COUNTRY_REGION_JSON_VERSION}.json') countries_json_path = os.path.join(settings.MEDIA_ROOT, f'countries+regions+states-{COUNTRY_REGION_JSON_VERSION}.json')
if not os.path.exists(countries_json_path) or force: if not os.path.exists(countries_json_path) or force:
res = requests.get(f'https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/{COUNTRY_REGION_JSON_VERSION}/json/countries%2Bstates%2Bcities.json') res = requests.get(f'https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/{COUNTRY_REGION_JSON_VERSION}/json/countries%2Bstates%2Bcities.json')
@ -69,16 +74,30 @@ class Command(BaseCommand):
self.stdout.write(self.style.SUCCESS('Latest country, region, and state data already downloaded.')) self.stdout.write(self.style.SUCCESS('Latest country, region, and state data already downloaded.'))
return return
insert_id = uuid.uuid4()
with transaction.atomic(): with transaction.atomic():
# Process data in chunks using ijson
f = open(countries_json_path, 'rb') f = open(countries_json_path, 'rb')
parser = ijson.items(f, 'item') parser = ijson.items(f, 'item')
existing_countries = {}
for country in Country.objects.iterator():
existing_countries[country.country_code] = country
existing_regions = {}
for region in Region.objects.iterator():
existing_regions[region.id] = region
existing_cities = {}
for city in City.objects.iterator():
existing_cities[city.id] = city
region_batch = [] countries_to_create = []
city_batch = [] regions_to_create = []
existing_region_ids = set() countries_to_update = []
existing_city_ids = set() regions_to_update = []
cities_to_create = []
cities_to_update = []
processed_country_codes = set()
processed_region_ids = set()
processed_city_ids = set()
for country in parser: for country in parser:
country_code = country['iso2'] country_code = country['iso2']
@ -88,102 +107,156 @@ class Command(BaseCommand):
longitude = round(float(country['longitude']), 6) if country['longitude'] else None longitude = round(float(country['longitude']), 6) if country['longitude'] else None
latitude = round(float(country['latitude']), 6) if country['latitude'] else None latitude = round(float(country['latitude']), 6) if country['latitude'] else None
country_obj, _ = Country.objects.update_or_create( processed_country_codes.add(country_code)
if country_code in existing_countries:
country_obj = existing_countries[country_code]
country_obj.name = country_name
country_obj.subregion = country_subregion
country_obj.capital = country_capital
country_obj.longitude = longitude
country_obj.latitude = latitude
countries_to_update.append(country_obj)
else:
country_obj = Country(
name=country_name,
country_code=country_code, country_code=country_code,
defaults={ subregion=country_subregion,
'name': country_name, capital=country_capital,
'subregion': country_subregion, longitude=longitude,
'capital': country_capital, latitude=latitude
'longitude': longitude,
'latitude': latitude,
'insert_id': insert_id
}
) )
countries_to_create.append(country_obj)
saveCountryFlag(country_code) saveCountryFlag(country_code)
log_memory_usage(country_code) # self.stdout.write(self.style.SUCCESS(f'Country {country_name} prepared'))
if country['states']: if country['states']:
for state in country['states']: for state in country['states']:
state_id = f"{country_code}-{state['state_code']}" if state['state_code'] else f"{country_code}-00" name = state['name']
state_id = f"{country_code}-{state['state_code']}"
latitude = round(float(state['latitude']), 6) if state['latitude'] else None
longitude = round(float(state['longitude']), 6) if state['longitude'] else None
# Ensure no duplicate regions # Check for duplicate regions
if state_id not in existing_region_ids: if state_id in processed_region_ids:
self.stdout.write(self.style.ERROR(f'State {state_id} already processed'))
continue
log_memory_usage(f"State {state_id} processing")
processed_region_ids.add(state_id)
if state_id in existing_regions:
region_obj = existing_regions[state_id]
region_obj.name = name
region_obj.country = country_obj
region_obj.longitude = longitude
region_obj.latitude = latitude
regions_to_update.append(region_obj)
else:
region_obj = Region( region_obj = Region(
id=state_id, id=state_id,
name=state['name'], name=name,
country=country_obj, country=country_obj,
longitude=state['longitude'], longitude=longitude,
latitude=state['latitude'], latitude=latitude
insert_id=insert_id
) )
region_batch.append(region_obj) regions_to_create.append(region_obj)
existing_region_ids.add(state_id) # self.stdout.write(self.style.SUCCESS(f'State {state_id} prepared'))
log_memory_usage(state_id)
# Handle cities and avoid duplicates
if 'cities' in state and len(state['cities']) > 0: if 'cities' in state and len(state['cities']) > 0:
for city in state['cities']: for city in state['cities']:
city_id = f"{state_id}-{city['id']}" city_id = f"{state_id}-{city['id']}"
city_name = city['name']
latitude = round(float(city['latitude']), 6) if city['latitude'] else None
longitude = round(float(city['longitude']), 6) if city['longitude'] else None
if city_id not in existing_city_ids: # Check for duplicate cities
if city_id in processed_city_ids:
self.stdout.write(self.style.ERROR(f'City {city_id} already processed'))
continue
processed_city_ids.add(city_id)
if city_id in existing_cities:
city_obj = existing_cities[city_id]
city_obj.name = city_name
city_obj.region = region_obj
city_obj.longitude = longitude
city_obj.latitude = latitude
cities_to_update.append(city_obj)
else:
city_obj = City( city_obj = City(
id=city_id, id=city_id,
name=city['name'], name=city_name,
region=region_obj, region=region_obj,
longitude=city['longitude'], longitude=longitude,
latitude=city['latitude'], latitude=latitude
insert_id=insert_id
) )
city_batch.append(city_obj) cities_to_create.append(city_obj)
existing_city_ids.add(city_id) # self.stdout.write(self.style.SUCCESS(f'City {city_id} prepared'))
# Bulk insert regions in smaller batches else:
if len(region_batch) >= 100: state_id = f"{country_code}-00"
Region.objects.bulk_create( processed_region_ids.add(state_id)
region_batch, if state_id in existing_regions:
update_conflicts=True, region_obj = existing_regions[state_id]
batch_size=100, region_obj.name = country_name
update_fields=['name', 'country', 'longitude', 'latitude', 'insert_id'], region_obj.country = country_obj
unique_fields=['id'] regions_to_update.append(region_obj)
else:
region_obj = Region(
id=state_id,
name=country_name,
country=country_obj
) )
region_batch.clear() regions_to_create.append(region_obj)
# self.stdout.write(self.style.SUCCESS(f'Region {state_id} prepared for {country_name}'))
# Process in batches
for i in range(0, len(countries_to_create), batch_size):
batch = countries_to_create[i:i + batch_size]
Country.objects.bulk_create(batch)
self.stdout.write(self.style.SUCCESS(f'Processed countries batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}'))
log_memory_usage(f"Country batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}")
# Bulk insert cities in smaller batches for i in range(0, len(regions_to_create), batch_size):
if len(city_batch) >= 100: batch = regions_to_create[i:i + batch_size]
City.objects.bulk_create( Region.objects.bulk_create(batch)
city_batch, self.stdout.write(self.style.SUCCESS(f'Processed regions batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}'))
update_conflicts=True, log_memory_usage(f"Region batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}")
batch_size=100,
update_fields=['name', 'region', 'longitude', 'latitude', 'insert_id'],
unique_fields=['id']
)
city_batch.clear()
# Final insertion of any remaining regions and cities for i in range(0, len(cities_to_create), batch_size):
if region_batch: batch = cities_to_create[i:i + batch_size]
Region.objects.bulk_create( City.objects.bulk_create(batch)
region_batch, self.stdout.write(self.style.SUCCESS(f'Processed cities batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}'))
update_conflicts=True, log_memory_usage(f"City batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}")
batch_size=100,
update_fields=['name', 'country', 'longitude', 'latitude', 'insert_id'],
unique_fields=['id']
)
if city_batch: # Process updates in batches
City.objects.bulk_create( for i in range(0, len(countries_to_update), batch_size):
city_batch, batch = countries_to_update[i:i + batch_size]
update_conflicts=True, Country.objects.bulk_update(batch, ['name', 'subregion', 'capital', 'longitude', 'latitude'])
batch_size=100, self.stdout.write(self.style.SUCCESS(f'Updated countries batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}'))
update_fields=['name', 'region', 'longitude', 'latitude', 'insert_id'], log_memory_usage(f"Country update batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}")
unique_fields=['id']
)
self.stdout.write(self.style.SUCCESS('Regions and cities created')) for i in range(0, len(regions_to_update), batch_size):
batch = regions_to_update[i:i + batch_size]
Region.objects.bulk_update(batch, ['name', 'country', 'longitude', 'latitude'])
self.stdout.write(self.style.SUCCESS(f'Updated regions batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}'))
log_memory_usage(f"Region update batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}")
# Clean up old data for i in range(0, len(cities_to_update), batch_size):
Country.objects.exclude(insert_id=insert_id).delete() batch = cities_to_update[i:i + batch_size]
Region.objects.exclude(insert_id=insert_id).delete() City.objects.bulk_update(batch, ['name', 'region', 'longitude', 'latitude'])
City.objects.exclude(insert_id=insert_id).delete() self.stdout.write(self.style.SUCCESS(f'Updated cities batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}'))
log_memory_usage(f"City update batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}")
self.stdout.write(self.style.SUCCESS('All data imported successfully and old data cleaned up')) # Delete countries and regions that are no longer in the data
Country.objects.exclude(country_code__in=processed_country_codes).delete()
log_memory_usage("Extra Countries deleted")
Region.objects.exclude(id__in=processed_region_ids).delete()
log_memory_usage("Extra Regions deleted")
City.objects.exclude(id__in=processed_city_ids).delete()
log_memory_usage("Extra Cities deleted")
self.stdout.write(self.style.SUCCESS('All data imported successfully'))