diff --git a/location/STATUS_FIELD.md b/location/STATUS_FIELD.md new file mode 100644 index 00000000..0139db50 --- /dev/null +++ b/location/STATUS_FIELD.md @@ -0,0 +1,136 @@ +# Location Data Normalization - Status Field + +## Overview + +The location models (Country, State, City) now include a `status` field to track the processing state of location data. This implements a canonical data architecture that ensures data quality and traceability. + +## Status Values + +The `status` field can have one of the following values: + +| Status | Description | +|--------|-------------| +| **RAW** | Raw data, no processing. Default value for new records. | +| **CLEANED** | Pre-cleaned data. HTML removed, spaces normalized. | +| **MATCHED** | Matched to a canonical record from reference databases. | +| **VERIFIED** | Officially validated against authoritative sources. | +| **REJECTED** | Invalid or unresolvable data that cannot be matched. | + +## Data Cleaning + +Each model now includes a `clean_data()` class method for pre-cleaning operations: + +### City.clean_data(name) +Removes HTML tags and normalizes spaces in city names. + +```python +cleaned_name = City.clean_data("

<p>São Paulo</p>

") +# Returns: "São Paulo" +``` + +### State.clean_data(name, acronym) +Removes HTML tags and normalizes spaces in state names and acronyms. + +```python +cleaned_name, cleaned_acronym = State.clean_data("São Paulo", "SP") +# Returns: ("São Paulo", "SP") +``` + +### Country.clean_data(name, acronym, acron3) +Removes HTML tags and normalizes spaces in country names and acronyms. + +```python +cleaned_name, cleaned_acronym, cleaned_acron3 = Country.clean_data( + "Brazil", + "BR", + "BRA" +) +# Returns: ("Brazil", "BR", "BRA") +``` + +## Usage Example + +### Creating records with status + +```python +from django.contrib.auth import get_user_model + +User = get_user_model() +user = User.objects.first() + +# Create a city with RAW status (default) +city = City.create(user=user, name="São Paulo") +# city.status == "RAW" + +# Create a city with VERIFIED status +verified_city = City.create(user=user, name="Rio de Janeiro", status="VERIFIED") +# verified_city.status == "VERIFIED" +``` + +### Cleaning data before creation + +```python +# Dirty data from external source +dirty_name = "

<p>São Paulo City</p>

" + +# Clean the data +cleaned_name = City.clean_data(dirty_name) +# cleaned_name == "São Paulo City" + +# Create with CLEANED status +city = City.create(user=user, name=cleaned_name, status="CLEANED") +``` + +## Workflow + +The typical workflow for location data is: + +1. **RAW** → Data is initially created/imported in raw form +2. **CLEANED** → HTML is removed, spaces normalized +3. **MATCHED** → Data is matched to canonical reference (e.g., GeoNames) +4. **VERIFIED** → Data is validated against authoritative source +5. **REJECTED** → Data that cannot be verified or matched + +## Database Migration + +The status field was added via migration `0004_add_status_field.py`: +- Adds nullable `status` field to City, State, and Country models +- Default value is "RAW" +- Max length: 10 characters +- Choices: RAW, CLEANED, MATCHED, VERIFIED, REJECTED + +## Testing + +Comprehensive tests have been added to verify: +- Default status is RAW +- Status can be set to any valid value +- `clean_data()` methods remove HTML +- `clean_data()` methods normalize spaces +- `clean_data()` methods handle None values + +Run tests with: +```bash +python manage.py test location +``` + +## Reference Data Sources + +The canonical location data should be sourced from: + +1. **Countries States Cities Database** + - GitHub: https://github.com/dr5hn/countries-states-cities-database + - Comprehensive database of countries, states, and cities + +2. **GeoNames** + - Website: https://www.geonames.org/ + - Official geographical database + +## Future Enhancements + +Potential improvements for the location normalization system: + +1. Add methods to transition between states +2. Implement automatic matching against reference databases +3. Add validation rules for each status transition +4. Create admin views to bulk-update status +5. 
Add logging/audit trail for status changes diff --git a/location/choices.py b/location/choices.py index 21f53c65..e68470ae 100755 --- a/location/choices.py +++ b/location/choices.py @@ -6,3 +6,12 @@ ("Sudeste", "Sudeste"), ("Sul", "Sul"), ) + +# Processing status for canonical location data +LOCATION_STATUS = ( + ("RAW", "RAW"), # Raw data, no processing + ("CLEANED", "CLEANED"), # Pre-cleaned data + ("MATCHED", "MATCHED"), # Matched to canonical record + ("VERIFIED", "VERIFIED"), # Officially validated + ("REJECTED", "REJECTED"), # Invalid or unresolvable +) diff --git a/location/migrations/0004_add_status_field.py b/location/migrations/0004_add_status_field.py new file mode 100644 index 00000000..b1546f1c --- /dev/null +++ b/location/migrations/0004_add_status_field.py @@ -0,0 +1,65 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('location', '0003_alter_city_options_alter_country_options_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='city', + name='status', + field=models.CharField( + blank=True, + choices=[ + ('RAW', 'RAW'), + ('CLEANED', 'CLEANED'), + ('MATCHED', 'MATCHED'), + ('VERIFIED', 'VERIFIED'), + ('REJECTED', 'REJECTED') + ], + default='RAW', + max_length=10, + null=True, + verbose_name='Status' + ), + ), + migrations.AddField( + model_name='state', + name='status', + field=models.CharField( + blank=True, + choices=[ + ('RAW', 'RAW'), + ('CLEANED', 'CLEANED'), + ('MATCHED', 'MATCHED'), + ('VERIFIED', 'VERIFIED'), + ('REJECTED', 'REJECTED') + ], + default='RAW', + max_length=10, + null=True, + verbose_name='Status' + ), + ), + migrations.AddField( + model_name='country', + name='status', + field=models.CharField( + blank=True, + choices=[ + ('RAW', 'RAW'), + ('CLEANED', 'CLEANED'), + ('MATCHED', 'MATCHED'), + ('VERIFIED', 'VERIFIED'), + ('REJECTED', 'REJECTED') + ], + default='RAW', + max_length=10, + null=True, + verbose_name='Status' + ), + ), + ] diff --git 
a/location/models.py b/location/models.py index 015e84b2..ef549229 100755 --- a/location/models.py +++ b/location/models.py @@ -4,6 +4,7 @@ from django.db import models, IntegrityError from django.db.models import Q +from django.utils.html import strip_tags from django.utils.translation import gettext_lazy as _ from modelcluster.fields import ParentalKey from modelcluster.models import ClusterableModel @@ -15,6 +16,27 @@ from core.forms import CoreAdminModelForm from core.models import CommonControlField, Language, TextWithLang from core.utils.standardizer import standardize_name, standardize_code_and_name, remove_extra_spaces +from .choices import LOCATION_STATUS + + +def clean_text_data(text): + """ + Utility function to clean location text data. + Removes HTML tags and normalizes whitespace. + + Args: + text: The text to clean, can be None + + Returns: + Cleaned text or None if input was None + """ + if not text: + return text + # Remove HTML tags + cleaned = strip_tags(text) + # Remove extra spaces + cleaned = remove_extra_spaces(cleaned) + return cleaned class City(CommonControlField): @@ -23,12 +45,21 @@ class City(CommonControlField): Fields: name + status: Processing status (RAW, CLEANED, MATCHED, VERIFIED, REJECTED) """ name = models.TextField(_("Name of the city"), unique=True) + status = models.CharField( + _("Status"), + max_length=10, + choices=LOCATION_STATUS, + default="RAW", + blank=True, + null=True, + ) base_form_class = CoreAdminModelForm - panels = [FieldPanel("name")] + panels = [FieldPanel("name"), FieldPanel("status")] autocomplete_search_field = "name" def autocomplete_label(self): @@ -59,11 +90,11 @@ def load(cls, user, file_path=None): logging.exception(e) @classmethod - def get_or_create(cls, user=None, name=None): + def get_or_create(cls, user=None, name=None, status="RAW"): try: return cls.get(name) except cls.DoesNotExist: - return cls.create(user, name) + return cls.create(user, name, status) @classmethod def get(cls, name): @@ -76,19 
+107,27 @@ def get(cls, name): return cls.objects.filter(name__iexact=name).first() @classmethod - def create(cls, user=None, name=None): + def create(cls, user=None, name=None, status="RAW"): name = remove_extra_spaces(name) if not name: raise ValueError("City.get_or_create requires name") try: city = City() city.name = name + city.status = status city.creator = user city.save() return city except IntegrityError: return cls.get(name) + @classmethod + def clean_data(cls, name): + """ + Pre-clean city name data: remove HTML, extra spaces, etc. + """ + return clean_text_data(name) + @staticmethod def standardize(text, user=None): """ @@ -110,13 +149,22 @@ class State(CommonControlField): Fields: name acronym + status: Processing status (RAW, CLEANED, MATCHED, VERIFIED, REJECTED) """ name = models.TextField(_("State name"), null=True, blank=True) acronym = models.CharField(_("State Acronym"), max_length=2, null=True, blank=True) + status = models.CharField( + _("Status"), + max_length=10, + choices=LOCATION_STATUS, + default="RAW", + blank=True, + null=True, + ) base_form_class = CoreAdminModelForm - panels = [FieldPanel("name"), FieldPanel("acronym")] + panels = [FieldPanel("name"), FieldPanel("acronym"), FieldPanel("status")] @staticmethod def autocomplete_custom_queryset_filter(search_term): @@ -166,8 +214,8 @@ def load(cls, user, file_path=None): ) @classmethod - def get_or_create(cls, user=None, name=None, acronym=None): - return cls.create_or_update(user, name=name, acronym=acronym) + def get_or_create(cls, user=None, name=None, acronym=None, status=None): + return cls.create_or_update(user, name=name, acronym=acronym, status=status) @classmethod def get(cls, name=None, acronym=None): @@ -181,7 +229,7 @@ def get(cls, name=None, acronym=None): raise ValueError("State.get requires name or acronym") @classmethod - def create(cls, user, name=None, acronym=None): + def create(cls, user, name=None, acronym=None, status="RAW"): name = remove_extra_spaces(name) acronym 
= remove_extra_spaces(acronym) if name or acronym: @@ -189,6 +237,7 @@ def create(cls, user, name=None, acronym=None): obj = cls() obj.name = name obj.acronym = acronym + obj.status = status obj.creator = user obj.save() return obj @@ -197,7 +246,17 @@ def create(cls, user, name=None, acronym=None): raise ValueError("State.create requires name or acronym") @classmethod - def create_or_update(cls, user, name=None, acronym=None): + def clean_data(cls, name=None, acronym=None): + """ + Pre-clean state data: remove HTML, extra spaces, etc. + Returns tuple (cleaned_name, cleaned_acronym) + """ + cleaned_name = clean_text_data(name) + cleaned_acronym = clean_text_data(acronym) + return cleaned_name, cleaned_acronym + + @classmethod + def create_or_update(cls, user, name=None, acronym=None, status=None): name = remove_extra_spaces(name) acronym = remove_extra_spaces(acronym) try: @@ -205,9 +264,11 @@ def create_or_update(cls, user, name=None, acronym=None): obj.updated_by = user obj.name = name or obj.name obj.acronym = acronym or obj.acronym + if status is not None: + obj.status = status obj.save() except cls.DoesNotExist: - obj = cls.create(user, name, acronym) + obj = cls.create(user, name, acronym, status or "RAW") return obj @staticmethod @@ -332,6 +393,8 @@ class Country(CommonControlField, ClusterableModel): Fields: name acronym + acron3 + status: Processing status (RAW, CLEANED, MATCHED, VERIFIED, REJECTED) """ name = models.CharField(_("Country Name"), blank=True, null=True, max_length=255) @@ -341,12 +404,21 @@ class Country(CommonControlField, ClusterableModel): acron3 = models.CharField( _("Country Acronym (3 char)"), blank=True, null=True, max_length=3 ) + status = models.CharField( + _("Status"), + max_length=10, + choices=LOCATION_STATUS, + default="RAW", + blank=True, + null=True, + ) base_form_class = CoreAdminModelForm panels = [ FieldPanel("name"), FieldPanel("acronym"), FieldPanel("acron3"), + FieldPanel("status"), InlinePanel("country_name", 
label=_("Country names")), ] @@ -433,6 +505,7 @@ def create_or_update( acron3=None, country_names=None, lang_code2=None, + status=None, ): name = remove_extra_spaces(name) acronym = remove_extra_spaces(acronym) @@ -442,9 +515,12 @@ def create_or_update( try: obj = cls.get(name, acronym, acron3) obj.updated_by = user + if status is not None: + obj.status = status except cls.DoesNotExist: obj = cls() obj.creator = user + obj.status = status or "RAW" obj.name = name or obj.name obj.acronym = acronym or obj.acronym @@ -463,6 +539,17 @@ def create_or_update( ) return obj + @classmethod + def clean_data(cls, name=None, acronym=None, acron3=None): + """ + Pre-clean country data: remove HTML, extra spaces, etc. + Returns tuple (cleaned_name, cleaned_acronym, cleaned_acron3) + """ + cleaned_name = clean_text_data(name) + cleaned_acronym = clean_text_data(acronym) + cleaned_acron3 = clean_text_data(acron3) + return cleaned_name, cleaned_acronym, cleaned_acron3 + @staticmethod def standardize(text, user=None): """ diff --git a/location/tests.py b/location/tests.py index df02b59d..ccb846bb 100755 --- a/location/tests.py +++ b/location/tests.py @@ -207,3 +207,169 @@ def test_standardize_returns_dict_original(self): self.assertIsInstance(item["country"], dict) self.assertEqual("BR, MX, Chile", item["country"].get("name")) self.assertEqual(None, item["country"].get("code")) + + +class CityStatusTest(TestCase): + def test_city_default_status_is_raw(self): + """Test that City objects have RAW status by default""" + user, created = User.objects.get_or_create(username="adm") + city = models.City.create(user=user, name="Test City") + self.assertEqual(city.status, "RAW") + + def test_city_status_can_be_set(self): + """Test that City status can be set to different values""" + user, created = User.objects.get_or_create(username="adm") + city = models.City.create(user=user, name="Verified City", status="VERIFIED") + self.assertEqual(city.status, "VERIFIED") + + def 
test_city_clean_data_removes_html(self): + """Test that clean_data removes HTML tags""" + dirty_name = "

City Name

" + cleaned = models.City.clean_data(dirty_name) + self.assertEqual(cleaned, "City Name") + + def test_city_clean_data_removes_extra_spaces(self): + """Test that clean_data removes extra spaces""" + dirty_name = "City Name with spaces" + cleaned = models.City.clean_data(dirty_name) + self.assertEqual(cleaned, "City Name with spaces") + + def test_city_clean_data_handles_none(self): + """Test that clean_data handles None values""" + cleaned = models.City.clean_data(None) + self.assertIsNone(cleaned) + + def test_city_get_or_create_with_status(self): + """Test that get_or_create accepts status parameter""" + user, created = User.objects.get_or_create(username="adm") + city = models.City.get_or_create(user=user, name="Test City", status="CLEANED") + self.assertEqual(city.status, "CLEANED") + + +class StateStatusTest(TestCase): + def test_state_default_status_is_raw(self): + """Test that State objects have RAW status by default""" + user, created = User.objects.get_or_create(username="adm") + state = models.State.create(user=user, name="Test State", acronym="TS") + self.assertEqual(state.status, "RAW") + + def test_state_status_can_be_set(self): + """Test that State status can be set to different values""" + user, created = User.objects.get_or_create(username="adm") + state = models.State.create(user=user, name="Verified State", acronym="VS", status="VERIFIED") + self.assertEqual(state.status, "VERIFIED") + + def test_state_create_or_update_sets_status_on_new(self): + """Test that create_or_update sets status on new objects""" + user, created = User.objects.get_or_create(username="adm") + state = models.State.create_or_update(user=user, name="New State", acronym="NS", status="CLEANED") + self.assertEqual(state.status, "CLEANED") + + def test_state_create_or_update_updates_status_on_existing(self): + """Test that create_or_update updates status on existing objects""" + user, created = User.objects.get_or_create(username="adm") + state = 
models.State.create_or_update(user=user, name="Existing State", acronym="ES", status="RAW") + self.assertEqual(state.status, "RAW") + # Update the same state with a new status + state = models.State.create_or_update(user=user, name="Existing State", acronym="ES", status="VERIFIED") + self.assertEqual(state.status, "VERIFIED") + + def test_state_clean_data_removes_html(self): + """Test that clean_data removes HTML tags from name and acronym""" + dirty_name = "State Name" + dirty_acronym = "ST" + cleaned_name, cleaned_acronym = models.State.clean_data(dirty_name, dirty_acronym) + self.assertEqual(cleaned_name, "State Name") + self.assertEqual(cleaned_acronym, "ST") + + def test_state_clean_data_removes_extra_spaces(self): + """Test that clean_data removes extra spaces""" + dirty_name = "State Name" + dirty_acronym = "SP " + cleaned_name, cleaned_acronym = models.State.clean_data(dirty_name, dirty_acronym) + self.assertEqual(cleaned_name, "State Name") + self.assertEqual(cleaned_acronym, "SP") + + def test_state_clean_data_handles_none(self): + """Test that clean_data handles None values""" + cleaned_name, cleaned_acronym = models.State.clean_data(None, None) + self.assertIsNone(cleaned_name) + self.assertIsNone(cleaned_acronym) + + def test_state_get_or_create_with_status(self): + """Test that get_or_create accepts status parameter""" + user, created = User.objects.get_or_create(username="adm") + state = models.State.get_or_create(user=user, name="Test State", acronym="TS", status="MATCHED") + self.assertEqual(state.status, "MATCHED") + + +class CountryStatusTest(TestCase): + def test_country_default_status_is_raw(self): + """Test that Country objects have RAW status by default""" + user, created = User.objects.get_or_create(username="adm") + country = models.Country.create_or_update( + user=user, name="Test Country", acronym="TC" + ) + self.assertEqual(country.status, "RAW") + + def test_country_status_can_be_set(self): + """Test that Country status can be set to 
different values""" + user, created = User.objects.get_or_create(username="adm") + country = models.Country.create_or_update( + user=user, name="Verified Country", acronym="VC", status="VERIFIED" + ) + self.assertEqual(country.status, "VERIFIED") + + def test_country_create_or_update_sets_status_on_new(self): + """Test that create_or_update sets status on new objects""" + user, created = User.objects.get_or_create(username="adm") + country = models.Country.create_or_update( + user=user, name="New Country", acronym="NC", status="CLEANED" + ) + self.assertEqual(country.status, "CLEANED") + + def test_country_create_or_update_updates_status_on_existing(self): + """Test that create_or_update updates status on existing objects""" + user, created = User.objects.get_or_create(username="adm") + country = models.Country.create_or_update( + user=user, name="Existing Country", acronym="EC", status="RAW" + ) + self.assertEqual(country.status, "RAW") + # Update the same country with a new status + country = models.Country.create_or_update( + user=user, name="Existing Country", acronym="EC", status="MATCHED" + ) + self.assertEqual(country.status, "MATCHED") + + def test_country_clean_data_removes_html(self): + """Test that clean_data removes HTML tags""" + dirty_name = "Country Name" + dirty_acronym = "CN" + dirty_acron3 = "CNT" + cleaned_name, cleaned_acronym, cleaned_acron3 = models.Country.clean_data( + dirty_name, dirty_acronym, dirty_acron3 + ) + self.assertEqual(cleaned_name, "Country Name") + self.assertEqual(cleaned_acronym, "CN") + self.assertEqual(cleaned_acron3, "CNT") + + def test_country_clean_data_removes_extra_spaces(self): + """Test that clean_data removes extra spaces""" + dirty_name = "Country Name" + dirty_acronym = "CN " + dirty_acron3 = "CNT " + cleaned_name, cleaned_acronym, cleaned_acron3 = models.Country.clean_data( + dirty_name, dirty_acronym, dirty_acron3 + ) + self.assertEqual(cleaned_name, "Country Name") + self.assertEqual(cleaned_acronym, "CN") + 
self.assertEqual(cleaned_acron3, "CNT") + + def test_country_clean_data_handles_none(self): + """Test that clean_data handles None values""" + cleaned_name, cleaned_acronym, cleaned_acron3 = models.Country.clean_data( + None, None, None + ) + self.assertIsNone(cleaned_name) + self.assertIsNone(cleaned_acronym) + self.assertIsNone(cleaned_acron3)