1
0
Fork 0
mirror of https://github.com/seanmorley15/AdventureLog.git synced 2025-07-19 12:59:36 +02:00

fix: add warning for interrupted download-countries command due to memory issues

This commit is contained in:
Sean Morley 2025-01-13 14:12:05 -05:00
parent f27453824a
commit a766a07ea1
2 changed files with 173 additions and 95 deletions

View file

@ -52,8 +52,13 @@ EOF
fi
# Sync the countries and world travel regions
# Sync the countries and world travel regions
python manage.py download-countries
if [ $? -eq 137 ]; then
>&2 echo "WARNING: The download-countries command was interrupted. This is likely due to lack of memory allocated to the container or the host. Please try again with more memory."
exit 1
fi
cat /code/adventurelog.txt

View file

@ -1,11 +1,9 @@
import os
import requests
import uuid
from django.core.management.base import BaseCommand
import requests
from worldtravel.models import Country, Region, City
from django.db import transaction
import ijson
from django.conf import settings
import psutil
def get_memory_usage():
@ -17,15 +15,22 @@ def log_memory_usage(stage):
memory_usage = get_memory_usage()
print(f"Memory usage at {stage}: {memory_usage / 1024 / 1024:.2f} MB")
from django.conf import settings
COUNTRY_REGION_JSON_VERSION = settings.COUNTRY_REGION_JSON_VERSION
media_root = settings.MEDIA_ROOT
def saveCountryFlag(country_code):
# For standards, use the lowercase country_code
country_code = country_code.lower()
flags_dir = os.path.join(media_root, 'flags')
# Check if the flags directory exists, if not, create it
if not os.path.exists(flags_dir):
os.makedirs(flags_dir)
# Check if the flag already exists in the media folder
flag_path = os.path.join(flags_dir, f'{country_code}.png')
if os.path.exists(flag_path):
print(f'Flag for {country_code} already exists')
@ -46,8 +51,8 @@ class Command(BaseCommand):
parser.add_argument('--force', action='store_true', help='Force download the countries+regions+states.json file')
def handle(self, **options):
log_memory_usage("start")
force = options['force']
batch_size = 250
countries_json_path = os.path.join(settings.MEDIA_ROOT, f'countries+regions+states-{COUNTRY_REGION_JSON_VERSION}.json')
if not os.path.exists(countries_json_path) or force:
res = requests.get(f'https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/{COUNTRY_REGION_JSON_VERSION}/json/countries%2Bstates%2Bcities.json')
@ -69,16 +74,30 @@ class Command(BaseCommand):
self.stdout.write(self.style.SUCCESS('Latest country, region, and state data already downloaded.'))
return
insert_id = uuid.uuid4()
with transaction.atomic():
# Process data in chunks using ijson
f = open(countries_json_path, 'rb')
parser = ijson.items(f, 'item')
existing_countries = {}
for country in Country.objects.iterator():
existing_countries[country.country_code] = country
existing_regions = {}
for region in Region.objects.iterator():
existing_regions[region.id] = region
existing_cities = {}
for city in City.objects.iterator():
existing_cities[city.id] = city
region_batch = []
city_batch = []
existing_region_ids = set()
existing_city_ids = set()
countries_to_create = []
regions_to_create = []
countries_to_update = []
regions_to_update = []
cities_to_create = []
cities_to_update = []
processed_country_codes = set()
processed_region_ids = set()
processed_city_ids = set()
for country in parser:
country_code = country['iso2']
@ -88,102 +107,156 @@ class Command(BaseCommand):
longitude = round(float(country['longitude']), 6) if country['longitude'] else None
latitude = round(float(country['latitude']), 6) if country['latitude'] else None
country_obj, _ = Country.objects.update_or_create(
processed_country_codes.add(country_code)
if country_code in existing_countries:
country_obj = existing_countries[country_code]
country_obj.name = country_name
country_obj.subregion = country_subregion
country_obj.capital = country_capital
country_obj.longitude = longitude
country_obj.latitude = latitude
countries_to_update.append(country_obj)
else:
country_obj = Country(
name=country_name,
country_code=country_code,
defaults={
'name': country_name,
'subregion': country_subregion,
'capital': country_capital,
'longitude': longitude,
'latitude': latitude,
'insert_id': insert_id
}
subregion=country_subregion,
capital=country_capital,
longitude=longitude,
latitude=latitude
)
countries_to_create.append(country_obj)
saveCountryFlag(country_code)
log_memory_usage(country_code)
# self.stdout.write(self.style.SUCCESS(f'Country {country_name} prepared'))
if country['states']:
for state in country['states']:
state_id = f"{country_code}-{state['state_code']}" if state['state_code'] else f"{country_code}-00"
name = state['name']
state_id = f"{country_code}-{state['state_code']}"
latitude = round(float(state['latitude']), 6) if state['latitude'] else None
longitude = round(float(state['longitude']), 6) if state['longitude'] else None
# Ensure no duplicate regions
if state_id not in existing_region_ids:
# Check for duplicate regions
if state_id in processed_region_ids:
self.stdout.write(self.style.ERROR(f'State {state_id} already processed'))
continue
log_memory_usage(f"State {state_id} processing")
processed_region_ids.add(state_id)
if state_id in existing_regions:
region_obj = existing_regions[state_id]
region_obj.name = name
region_obj.country = country_obj
region_obj.longitude = longitude
region_obj.latitude = latitude
regions_to_update.append(region_obj)
else:
region_obj = Region(
id=state_id,
name=state['name'],
name=name,
country=country_obj,
longitude=state['longitude'],
latitude=state['latitude'],
insert_id=insert_id
longitude=longitude,
latitude=latitude
)
region_batch.append(region_obj)
existing_region_ids.add(state_id)
log_memory_usage(state_id)
regions_to_create.append(region_obj)
# self.stdout.write(self.style.SUCCESS(f'State {state_id} prepared'))
# Handle cities and avoid duplicates
if 'cities' in state and len(state['cities']) > 0:
for city in state['cities']:
city_id = f"{state_id}-{city['id']}"
city_name = city['name']
latitude = round(float(city['latitude']), 6) if city['latitude'] else None
longitude = round(float(city['longitude']), 6) if city['longitude'] else None
if city_id not in existing_city_ids:
# Check for duplicate cities
if city_id in processed_city_ids:
self.stdout.write(self.style.ERROR(f'City {city_id} already processed'))
continue
processed_city_ids.add(city_id)
if city_id in existing_cities:
city_obj = existing_cities[city_id]
city_obj.name = city_name
city_obj.region = region_obj
city_obj.longitude = longitude
city_obj.latitude = latitude
cities_to_update.append(city_obj)
else:
city_obj = City(
id=city_id,
name=city['name'],
name=city_name,
region=region_obj,
longitude=city['longitude'],
latitude=city['latitude'],
insert_id=insert_id
longitude=longitude,
latitude=latitude
)
city_batch.append(city_obj)
existing_city_ids.add(city_id)
cities_to_create.append(city_obj)
# self.stdout.write(self.style.SUCCESS(f'City {city_id} prepared'))
# Bulk insert regions in smaller batches
if len(region_batch) >= 100:
Region.objects.bulk_create(
region_batch,
update_conflicts=True,
batch_size=100,
update_fields=['name', 'country', 'longitude', 'latitude', 'insert_id'],
unique_fields=['id']
else:
state_id = f"{country_code}-00"
processed_region_ids.add(state_id)
if state_id in existing_regions:
region_obj = existing_regions[state_id]
region_obj.name = country_name
region_obj.country = country_obj
regions_to_update.append(region_obj)
else:
region_obj = Region(
id=state_id,
name=country_name,
country=country_obj
)
region_batch.clear()
regions_to_create.append(region_obj)
# self.stdout.write(self.style.SUCCESS(f'Region {state_id} prepared for {country_name}'))
# Process in batches
for i in range(0, len(countries_to_create), batch_size):
batch = countries_to_create[i:i + batch_size]
Country.objects.bulk_create(batch)
self.stdout.write(self.style.SUCCESS(f'Processed countries batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}'))
log_memory_usage(f"Country batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}")
# Bulk insert cities in smaller batches
if len(city_batch) >= 100:
City.objects.bulk_create(
city_batch,
update_conflicts=True,
batch_size=100,
update_fields=['name', 'region', 'longitude', 'latitude', 'insert_id'],
unique_fields=['id']
)
city_batch.clear()
for i in range(0, len(regions_to_create), batch_size):
batch = regions_to_create[i:i + batch_size]
Region.objects.bulk_create(batch)
self.stdout.write(self.style.SUCCESS(f'Processed regions batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}'))
log_memory_usage(f"Region batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}")
# Final insertion of any remaining regions and cities
if region_batch:
Region.objects.bulk_create(
region_batch,
update_conflicts=True,
batch_size=100,
update_fields=['name', 'country', 'longitude', 'latitude', 'insert_id'],
unique_fields=['id']
)
for i in range(0, len(cities_to_create), batch_size):
batch = cities_to_create[i:i + batch_size]
City.objects.bulk_create(batch)
self.stdout.write(self.style.SUCCESS(f'Processed cities batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}'))
log_memory_usage(f"City batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}")
if city_batch:
City.objects.bulk_create(
city_batch,
update_conflicts=True,
batch_size=100,
update_fields=['name', 'region', 'longitude', 'latitude', 'insert_id'],
unique_fields=['id']
)
# Process updates in batches
for i in range(0, len(countries_to_update), batch_size):
batch = countries_to_update[i:i + batch_size]
Country.objects.bulk_update(batch, ['name', 'subregion', 'capital', 'longitude', 'latitude'])
self.stdout.write(self.style.SUCCESS(f'Updated countries batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}'))
log_memory_usage(f"Country update batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}")
self.stdout.write(self.style.SUCCESS('Regions and cities created'))
for i in range(0, len(regions_to_update), batch_size):
batch = regions_to_update[i:i + batch_size]
Region.objects.bulk_update(batch, ['name', 'country', 'longitude', 'latitude'])
self.stdout.write(self.style.SUCCESS(f'Updated regions batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}'))
log_memory_usage(f"Region update batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}")
# Clean up old data
Country.objects.exclude(insert_id=insert_id).delete()
Region.objects.exclude(insert_id=insert_id).delete()
City.objects.exclude(insert_id=insert_id).delete()
for i in range(0, len(cities_to_update), batch_size):
batch = cities_to_update[i:i + batch_size]
City.objects.bulk_update(batch, ['name', 'region', 'longitude', 'latitude'])
self.stdout.write(self.style.SUCCESS(f'Updated cities batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}'))
log_memory_usage(f"City update batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}")
self.stdout.write(self.style.SUCCESS('All data imported successfully and old data cleaned up'))
# Delete countries and regions that are no longer in the data
Country.objects.exclude(country_code__in=processed_country_codes).delete()
log_memory_usage("Extra Countries deleted")
Region.objects.exclude(id__in=processed_region_ids).delete()
log_memory_usage("Extra Regions deleted")
City.objects.exclude(id__in=processed_city_ids).delete()
log_memory_usage("Extra Cities deleted")
self.stdout.write(self.style.SUCCESS('All data imported successfully'))