diff --git a/backend/server/worldtravel/management/commands/download-countries.py b/backend/server/worldtravel/management/commands/download-countries.py index 97ffe3d..4556697 100644 --- a/backend/server/worldtravel/management/commands/download-countries.py +++ b/backend/server/worldtravel/management/commands/download-countries.py @@ -1,26 +1,31 @@ import os -from django.core.management.base import BaseCommand import requests +import uuid +from django.core.management.base import BaseCommand from worldtravel.models import Country, Region, City from django.db import transaction import ijson - from django.conf import settings +import psutil + +def get_memory_usage(): + process = psutil.Process(os.getpid()) + memory_info = process.memory_info() + return memory_info.rss # in bytes + +def log_memory_usage(stage): + memory_usage = get_memory_usage() + print(f"Memory usage at {stage}: {memory_usage / 1024 / 1024:.2f} MB") COUNTRY_REGION_JSON_VERSION = settings.COUNTRY_REGION_JSON_VERSION - media_root = settings.MEDIA_ROOT def saveCountryFlag(country_code): - # For standards, use the lowercase country_code country_code = country_code.lower() flags_dir = os.path.join(media_root, 'flags') - - # Check if the flags directory exists, if not, create it if not os.path.exists(flags_dir): os.makedirs(flags_dir) - # Check if the flag already exists in the media folder flag_path = os.path.join(flags_dir, f'{country_code}.png') if os.path.exists(flag_path): print(f'Flag for {country_code} already exists') @@ -41,8 +46,8 @@ class Command(BaseCommand): parser.add_argument('--force', action='store_true', help='Force download the countries+regions+states.json file') def handle(self, **options): + log_memory_usage("start") force = options['force'] - batch_size = 250 countries_json_path = os.path.join(settings.MEDIA_ROOT, f'countries+regions+states-{COUNTRY_REGION_JSON_VERSION}.json') if not os.path.exists(countries_json_path) or force: res = requests.get(f'https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/{COUNTRY_REGION_JSON_VERSION}/json/countries%2Bstates%2Bcities.json') @@ -63,25 +68,17 @@ class Command(BaseCommand): else: self.stdout.write(self.style.SUCCESS('Latest country, region, and state data already downloaded.')) return + + insert_id = uuid.uuid4() with transaction.atomic(): - # Process data in chunks using ijson f = open(countries_json_path, 'rb') parser = ijson.items(f, 'item') - existing_countries = {country.country_code: country for country in Country.objects.all()} - existing_regions = {region.id: region for region in Region.objects.all()} - existing_cities = {city.id: city for city in City.objects.all()} - countries_to_create = [] - regions_to_create = [] - countries_to_update = [] - regions_to_update = [] - cities_to_create = [] - cities_to_update = [] - - processed_country_codes = set() - processed_region_ids = set() - processed_city_ids = set() + region_batch = [] + city_batch = [] + existing_region_ids = set() + existing_city_ids = set() for country in parser: country_code = country['iso2'] @@ -91,145 +88,102 @@ class Command(BaseCommand): longitude = round(float(country['longitude']), 6) if country['longitude'] else None latitude = round(float(country['latitude']), 6) if country['latitude'] else None - processed_country_codes.add(country_code) - - if country_code in existing_countries: - country_obj = existing_countries[country_code] - country_obj.name = country_name - country_obj.subregion = country_subregion - country_obj.capital = country_capital - country_obj.longitude = longitude - country_obj.latitude = latitude - countries_to_update.append(country_obj) - else: - country_obj = Country( - name=country_name, - country_code=country_code, - subregion=country_subregion, - capital=country_capital, - longitude=longitude, - latitude=latitude - ) - countries_to_create.append(country_obj) + country_obj, _ = Country.objects.update_or_create( + country_code=country_code, + defaults={ + 'name': country_name, + 'subregion': country_subregion, + 'capital': country_capital, + 'longitude': longitude, + 'latitude': latitude, + 'insert_id': insert_id + } + ) saveCountryFlag(country_code) - # self.stdout.write(self.style.SUCCESS(f'Country {country_name} prepared')) + log_memory_usage(country_code) - if country['states']: - for state in country['states']: - name = state['name'] - state_id = f"{country_code}-{state['state_code']}" - latitude = round(float(state['latitude']), 6) if state['latitude'] else None - longitude = round(float(state['longitude']), 6) if state['longitude'] else None - - # Check for duplicate regions - if state_id in processed_region_ids: - self.stdout.write(self.style.ERROR(f'State {state_id} already processed')) - continue - - processed_region_ids.add(state_id) - - if state_id in existing_regions: - region_obj = existing_regions[state_id] - region_obj.name = name - region_obj.country = country_obj - region_obj.longitude = longitude - region_obj.latitude = latitude - regions_to_update.append(region_obj) - else: - region_obj = Region( - id=state_id, - name=name, - country=country_obj, - longitude=longitude, - latitude=latitude - ) - regions_to_create.append(region_obj) - # self.stdout.write(self.style.SUCCESS(f'State {state_id} prepared')) - - if 'cities' in state and len(state['cities']) > 0: - for city in state['cities']: - city_id = f"{state_id}-{city['id']}" - city_name = city['name'] - latitude = round(float(city['latitude']), 6) if city['latitude'] else None - longitude = round(float(city['longitude']), 6) if city['longitude'] else None - - # Check for duplicate cities - if city_id in processed_city_ids: - self.stdout.write(self.style.ERROR(f'City {city_id} already processed')) - continue - - processed_city_ids.add(city_id) - - if city_id in existing_cities: - city_obj = existing_cities[city_id] - city_obj.name = city_name - city_obj.region = region_obj - city_obj.longitude = longitude - city_obj.latitude = latitude - cities_to_update.append(city_obj) - else: - city_obj = City( - id=city_id, - name=city_name, - region=region_obj, - longitude=longitude, - latitude=latitude - ) - cities_to_create.append(city_obj) - # self.stdout.write(self.style.SUCCESS(f'City {city_id} prepared')) - - else: - state_id = f"{country_code}-00" - processed_region_ids.add(state_id) - if state_id in existing_regions: - region_obj = existing_regions[state_id] - region_obj.name = country_name - region_obj.country = country_obj - regions_to_update.append(region_obj) - else: + if country['states']: + for state in country['states']: + state_id = f"{country_code}-{state['state_code']}" if state['state_code'] else f"{country_code}-00" + + # Ensure no duplicate regions + if state_id not in existing_region_ids: region_obj = Region( id=state_id, - name=country_name, - country=country_obj + name=state['name'], + country=country_obj, + longitude=state['longitude'], + latitude=state['latitude'], + insert_id=insert_id ) - regions_to_create.append(region_obj) - # self.stdout.write(self.style.SUCCESS(f'Region {state_id} prepared for {country_name}')) - # Process in batches - for i in range(0, len(countries_to_create), batch_size): - batch = countries_to_create[i:i + batch_size] - Country.objects.bulk_create(batch) - self.stdout.write(self.style.SUCCESS(f'Processed countries batch {i//batch_size + 1}/{(len(countries_to_create)-1)//batch_size + 1}')) + region_batch.append(region_obj) + existing_region_ids.add(state_id) + log_memory_usage(state_id) - for i in range(0, len(regions_to_create), batch_size): - batch = regions_to_create[i:i + batch_size] - Region.objects.bulk_create(batch) - self.stdout.write(self.style.SUCCESS(f'Processed regions batch {i//batch_size + 1}/{(len(regions_to_create)-1)//batch_size + 1}')) + # Handle cities and avoid duplicates + if 'cities' in state and len(state['cities']) > 0: + for city in state['cities']: + city_id = f"{state_id}-{city['id']}" + + if city_id not in existing_city_ids: + city_obj = City( + id=city_id, + name=city['name'], + region=region_obj, + longitude=city['longitude'], + latitude=city['latitude'], + insert_id=insert_id + ) + city_batch.append(city_obj) + existing_city_ids.add(city_id) - for i in range(0, len(cities_to_create), batch_size): - batch = cities_to_create[i:i + batch_size] - City.objects.bulk_create(batch) - self.stdout.write(self.style.SUCCESS(f'Processed cities batch {i//batch_size + 1}/{(len(cities_to_create)-1)//batch_size + 1}')) + # Bulk insert regions in smaller batches + if len(region_batch) >= 100: + Region.objects.bulk_create( + region_batch, + update_conflicts=True, + batch_size=100, + update_fields=['name', 'country', 'longitude', 'latitude', 'insert_id'], + unique_fields=['id'] + ) + region_batch.clear() - # Process updates in batches - for i in range(0, len(countries_to_update), batch_size): - batch = countries_to_update[i:i + batch_size] - Country.objects.bulk_update(batch, ['name', 'subregion', 'capital', 'longitude', 'latitude']) - self.stdout.write(self.style.SUCCESS(f'Updated countries batch {i//batch_size + 1}/{(len(countries_to_update)-1)//batch_size + 1}')) + # Bulk insert cities in smaller batches + if len(city_batch) >= 100: + City.objects.bulk_create( + city_batch, + update_conflicts=True, + batch_size=100, + update_fields=['name', 'region', 'longitude', 'latitude', 'insert_id'], + unique_fields=['id'] + ) + city_batch.clear() - for i in range(0, len(regions_to_update), batch_size): - batch = regions_to_update[i:i + batch_size] - Region.objects.bulk_update(batch, ['name', 'country', 'longitude', 'latitude']) - self.stdout.write(self.style.SUCCESS(f'Updated regions batch {i//batch_size + 1}/{(len(regions_to_update)-1)//batch_size + 1}')) + # Final insertion of any remaining regions and cities + if region_batch: + Region.objects.bulk_create( + region_batch, + update_conflicts=True, + batch_size=100, + update_fields=['name', 'country', 'longitude', 'latitude', 'insert_id'], + unique_fields=['id'] + ) - for i in range(0, len(cities_to_update), batch_size): - batch = cities_to_update[i:i + batch_size] - City.objects.bulk_update(batch, ['name', 'region', 'longitude', 'latitude']) - self.stdout.write(self.style.SUCCESS(f'Updated cities batch {i//batch_size + 1}/{(len(cities_to_update)-1)//batch_size + 1}')) + if city_batch: + City.objects.bulk_create( + city_batch, + update_conflicts=True, + batch_size=100, + update_fields=['name', 'region', 'longitude', 'latitude', 'insert_id'], + unique_fields=['id'] + ) - # Delete countries and regions that are no longer in the data - Country.objects.exclude(country_code__in=processed_country_codes).delete() - Region.objects.exclude(id__in=processed_region_ids).delete() - City.objects.exclude(id__in=processed_city_ids).delete() + self.stdout.write(self.style.SUCCESS('Regions and cities created')) - self.stdout.write(self.style.SUCCESS('All data imported successfully')) \ No newline at end of file + # Clean up old data + Country.objects.exclude(insert_id=insert_id).delete() + Region.objects.exclude(insert_id=insert_id).delete() + City.objects.exclude(insert_id=insert_id).delete() + + self.stdout.write(self.style.SUCCESS('All data imported successfully and old data cleaned up')) \ No newline at end of file diff --git a/backend/server/worldtravel/migrations/0015_city_insert_id_country_insert_id_region_insert_id.py b/backend/server/worldtravel/migrations/0015_city_insert_id_country_insert_id_region_insert_id.py new file mode 100644 index 0000000..5d7223b --- /dev/null +++ b/backend/server/worldtravel/migrations/0015_city_insert_id_country_insert_id_region_insert_id.py @@ -0,0 +1,28 @@ +# Generated by Django 5.0.8 on 2025-01-13 17:50 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('worldtravel', '0014_alter_visitedcity_options'), + ] + + operations = [ + migrations.AddField( + model_name='city', + name='insert_id', + field=models.UUIDField(blank=True, null=True), + ), + migrations.AddField( + model_name='country', + name='insert_id', + field=models.UUIDField(blank=True, null=True), + ), + migrations.AddField( + model_name='region', + name='insert_id', + field=models.UUIDField(blank=True, null=True), + ), + ] diff --git a/backend/server/worldtravel/models.py b/backend/server/worldtravel/models.py index 9e83f59..6c7ebb8 100644 --- a/backend/server/worldtravel/models.py +++ b/backend/server/worldtravel/models.py @@ -17,6 +17,7 @@ class Country(models.Model): capital = models.CharField(max_length=100, blank=True, null=True) longitude = models.DecimalField(max_digits=9, decimal_places=6, null=True, blank=True) latitude = models.DecimalField(max_digits=9, decimal_places=6, null=True, blank=True) + insert_id = models.UUIDField(unique=False, blank=True, null=True) class Meta: verbose_name = "Country" @@ -31,6 +32,7 @@ class Region(models.Model): country = models.ForeignKey(Country, on_delete=models.CASCADE) longitude = models.DecimalField(max_digits=9, decimal_places=6, null=True, blank=True) latitude = models.DecimalField(max_digits=9, decimal_places=6, null=True, blank=True) + insert_id = models.UUIDField(unique=False, blank=True, null=True) def __str__(self): return self.name @@ -41,6 +43,7 @@ class City(models.Model): region = models.ForeignKey(Region, on_delete=models.CASCADE) longitude = models.DecimalField(max_digits=9, decimal_places=6, null=True, blank=True) latitude = models.DecimalField(max_digits=9, decimal_places=6, null=True, blank=True) + insert_id = models.UUIDField(unique=False, blank=True, null=True) class Meta: verbose_name_plural = "Cities"