1
0
Fork 0
mirror of https://github.com/seanmorley15/AdventureLog.git synced 2025-08-04 04:35:19 +02:00

fix: implement memory limit and adjust batch size for country downloads

This commit is contained in:
Sean Morley 2025-01-13 14:42:48 -05:00
parent a766a07ea1
commit 563373dd54

View file

@ -3,8 +3,16 @@ from django.core.management.base import BaseCommand
import requests import requests
from worldtravel.models import Country, Region, City from worldtravel.models import Country, Region, City
from django.db import transaction from django.db import transaction
import ijson
import psutil import psutil
import ijson
import resource
def limit_memory(max_memory):
    """Cap this process's address space (``RLIMIT_AS``) at *max_memory* bytes.

    Only the soft limit is changed; the hard limit is read and passed back
    unchanged. A soft limit above a finite hard limit is rejected by
    ``setrlimit`` with ``ValueError``, so the requested value is clamped to
    the hard limit in that case instead of aborting the caller.

    Args:
        max_memory: Desired soft limit in bytes.
    """
    _, hard = resource.getrlimit(resource.RLIMIT_AS)
    if hard != resource.RLIM_INFINITY:
        # A finite hard limit is the ceiling for any soft limit we may set.
        if max_memory == resource.RLIM_INFINITY or max_memory > hard:
            max_memory = hard
    resource.setrlimit(resource.RLIMIT_AS, (max_memory, hard))
# Set memory limit to 800MB
limit_memory(800 * 1024 * 1024)
def get_memory_usage(): def get_memory_usage():
process = psutil.Process(os.getpid()) process = psutil.Process(os.getpid())
@ -52,7 +60,7 @@ class Command(BaseCommand):
def handle(self, **options): def handle(self, **options):
force = options['force'] force = options['force']
batch_size = 250 batch_size = 100
countries_json_path = os.path.join(settings.MEDIA_ROOT, f'countries+regions+states-{COUNTRY_REGION_JSON_VERSION}.json') countries_json_path = os.path.join(settings.MEDIA_ROOT, f'countries+regions+states-{COUNTRY_REGION_JSON_VERSION}.json')
if not os.path.exists(countries_json_path) or force: if not os.path.exists(countries_json_path) or force:
res = requests.get(f'https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/{COUNTRY_REGION_JSON_VERSION}/json/countries%2Bstates%2Bcities.json') res = requests.get(f'https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/{COUNTRY_REGION_JSON_VERSION}/json/countries%2Bstates%2Bcities.json')
@ -74,19 +82,14 @@ class Command(BaseCommand):
self.stdout.write(self.style.SUCCESS('Latest country, region, and state data already downloaded.')) self.stdout.write(self.style.SUCCESS('Latest country, region, and state data already downloaded.'))
return return
with transaction.atomic(): with open(countries_json_path, 'r') as f:
# Process data in chunks using ijson
f = open(countries_json_path, 'rb') f = open(countries_json_path, 'rb')
parser = ijson.items(f, 'item') parser = ijson.items(f, 'item')
existing_countries = {}
for country in Country.objects.iterator(): with transaction.atomic():
existing_countries[country.country_code] = country existing_countries = {country.country_code: country for country in Country.objects.all()}
existing_regions = {} existing_regions = {region.id: region for region in Region.objects.all()}
for region in Region.objects.iterator(): existing_cities = {city.id: city for city in City.objects.all()}
existing_regions[region.id] = region
existing_cities = {}
for city in City.objects.iterator():
existing_cities[city.id] = city
countries_to_create = [] countries_to_create = []
regions_to_create = [] regions_to_create = []
@ -129,7 +132,6 @@ class Command(BaseCommand):
countries_to_create.append(country_obj) countries_to_create.append(country_obj)
saveCountryFlag(country_code) saveCountryFlag(country_code)
# self.stdout.write(self.style.SUCCESS(f'Country {country_name} prepared'))
if country['states']: if country['states']:
for state in country['states']: for state in country['states']:
@ -142,7 +144,6 @@ class Command(BaseCommand):
if state_id in processed_region_ids: if state_id in processed_region_ids:
self.stdout.write(self.style.ERROR(f'State {state_id} already processed')) self.stdout.write(self.style.ERROR(f'State {state_id} already processed'))
continue continue
log_memory_usage(f"State {state_id} processing")
processed_region_ids.add(state_id) processed_region_ids.add(state_id)
@ -163,6 +164,7 @@ class Command(BaseCommand):
) )
regions_to_create.append(region_obj) regions_to_create.append(region_obj)
# self.stdout.write(self.style.SUCCESS(f'State {state_id} prepared')) # self.stdout.write(self.style.SUCCESS(f'State {state_id} prepared'))
log_memory_usage('state')
if 'cities' in state and len(state['cities']) > 0: if 'cities' in state and len(state['cities']) > 0:
for city in state['cities']: for city in state['cities']:
@ -195,6 +197,7 @@ class Command(BaseCommand):
) )
cities_to_create.append(city_obj) cities_to_create.append(city_obj)
# self.stdout.write(self.style.SUCCESS(f'City {city_id} prepared')) # self.stdout.write(self.style.SUCCESS(f'City {city_id} prepared'))
log_memory_usage('city')
else: else:
state_id = f"{country_code}-00" state_id = f"{country_code}-00"
@ -217,46 +220,42 @@ class Command(BaseCommand):
batch = countries_to_create[i:i + batch_size] batch = countries_to_create[i:i + batch_size]
Country.objects.bulk_create(batch) Country.objects.bulk_create(batch)
self.stdout.write(self.style.SUCCESS(f'Processed countries batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}')) self.stdout.write(self.style.SUCCESS(f'Processed countries batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}'))
log_memory_usage(f"Country batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}") log_memory_usage('country')
for i in range(0, len(regions_to_create), batch_size): for i in range(0, len(regions_to_create), batch_size):
batch = regions_to_create[i:i + batch_size] batch = regions_to_create[i:i + batch_size]
Region.objects.bulk_create(batch) Region.objects.bulk_create(batch)
self.stdout.write(self.style.SUCCESS(f'Processed regions batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}')) self.stdout.write(self.style.SUCCESS(f'Processed regions batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}'))
log_memory_usage(f"Region batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}") log_memory_usage('region')
for i in range(0, len(cities_to_create), batch_size): for i in range(0, len(cities_to_create), batch_size):
batch = cities_to_create[i:i + batch_size] batch = cities_to_create[i:i + batch_size]
City.objects.bulk_create(batch) City.objects.bulk_create(batch)
self.stdout.write(self.style.SUCCESS(f'Processed cities batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}')) self.stdout.write(self.style.SUCCESS(f'Processed cities batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}'))
log_memory_usage(f"City batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}") log_memory_usage('city')
# Process updates in batches # Process updates in batches
for i in range(0, len(countries_to_update), batch_size): for i in range(0, len(countries_to_update), batch_size):
batch = countries_to_update[i:i + batch_size] batch = countries_to_update[i:i + batch_size]
Country.objects.bulk_update(batch, ['name', 'subregion', 'capital', 'longitude', 'latitude']) Country.objects.bulk_update(batch, ['name', 'subregion', 'capital', 'longitude', 'latitude'])
self.stdout.write(self.style.SUCCESS(f'Updated countries batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}')) self.stdout.write(self.style.SUCCESS(f'Updated countries batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}'))
log_memory_usage(f"Country update batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}") log_memory_usage('country')
for i in range(0, len(regions_to_update), batch_size): for i in range(0, len(regions_to_update), batch_size):
batch = regions_to_update[i:i + batch_size] batch = regions_to_update[i:i + batch_size]
Region.objects.bulk_update(batch, ['name', 'country', 'longitude', 'latitude']) Region.objects.bulk_update(batch, ['name', 'country', 'longitude', 'latitude'])
self.stdout.write(self.style.SUCCESS(f'Updated regions batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}')) self.stdout.write(self.style.SUCCESS(f'Updated regions batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}'))
log_memory_usage(f"Region update batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}") log_memory_usage('region')
for i in range(0, len(cities_to_update), batch_size): for i in range(0, len(cities_to_update), batch_size):
batch = cities_to_update[i:i + batch_size] batch = cities_to_update[i:i + batch_size]
City.objects.bulk_update(batch, ['name', 'region', 'longitude', 'latitude']) City.objects.bulk_update(batch, ['name', 'region', 'longitude', 'latitude'])
self.stdout.write(self.style.SUCCESS(f'Updated cities batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}')) self.stdout.write(self.style.SUCCESS(f'Updated cities batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}'))
log_memory_usage(f"City update batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}") log_memory_usage('city')
# Delete countries, regions, and cities that are no longer in the data # Delete countries, regions, and cities that are no longer in the data
Country.objects.exclude(country_code__in=processed_country_codes).delete() Country.objects.exclude(country_code__in=processed_country_codes).delete()
log_memory_usage("Extra Countries deleted")
Region.objects.exclude(id__in=processed_region_ids).delete() Region.objects.exclude(id__in=processed_region_ids).delete()
log_memory_usage("Extra Regions deleted")
City.objects.exclude(id__in=processed_city_ids).delete() City.objects.exclude(id__in=processed_city_ids).delete()
log_memory_usage("Extra Cities deleted")
self.stdout.write(self.style.SUCCESS('All data imported successfully')) self.stdout.write(self.style.SUCCESS('All data imported successfully'))