From 29ea36cb2ec5558eb520c6a6f9d3b437fdcf7252 Mon Sep 17 00:00:00 2001 From: sanakhamassi Date: Wed, 12 Nov 2025 17:45:25 +0100 Subject: [PATCH 1/2] Adding formulas to json output --- grobid_client/format/TEI2LossyJSON.py | 158 +++++++++++++++----------- 1 file changed, 90 insertions(+), 68 deletions(-) diff --git a/grobid_client/format/TEI2LossyJSON.py b/grobid_client/format/TEI2LossyJSON.py index 76154ee..f437f72 100644 --- a/grobid_client/format/TEI2LossyJSON.py +++ b/grobid_client/format/TEI2LossyJSON.py @@ -66,6 +66,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False text_structure = [] document['body_text'] = text_structure figures_and_tables = [] + document['figures_and_tables'] = figures_and_tables references_structure = [] document['references'] = references_structure @@ -88,6 +89,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False ) for author in child.find_all("author") ] ) + ) doi_node = child.find("idno", type="DOI") @@ -206,7 +208,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False ] if graphic_coords else [] } ) - + # Extract references from listBibl with comprehensive processing list_bibl = soup.find("listBibl") if list_bibl: @@ -669,91 +671,110 @@ def _iter_passages_from_soup_for_text(self, text_node: Tag, passage_level: str) for passage in self._process_div_with_nested_content(div, passage_level, head_paragraph): yield passage + def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_paragraph: str = None) -> Iterator[Dict[str, Union[str, Dict[str, str]]]]: """ - Process a div and its nested content, handling various back section types. - Supports nested divs for complex back sections like annex with multiple subsections. + Process a div **in document order**. Paragraphs and formulas are yielded + exactly where they appear in the XML. """ head = div.find("head") - p_nodes = div.find_all("p") head_section = None current_head_paragraph = None - # Check if this div has nested divs first (handle namespace variants) - nested_divs = [] - for child in div.children: - if hasattr(child, 'name') and child.name: - # Handle both namespaced and non-namespaced divs - if child.name == "div" or child.name.endswith(":div"): - nested_divs.append(child) - - # Count only direct child paragraphs, not those in nested divs - direct_p_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "p"] - - if len(nested_divs) > 0 and len(direct_p_nodes) == 0: - # This is a container div - process each nested div independently - for nested_div in nested_divs: - # Skip references divs - if nested_div.get("type") == "references": - continue - # Pass None as head_paragraph to ensure nested divs use their own headers - for passage in self._process_div_with_nested_content(nested_div, passage_level, None): - yield passage - return # Don't process this div further - - # Determine the section header and content type for divs with content + # ------------------------------------------------------------------ + # 1. Determine the section header + # ------------------------------------------------------------------ if head: - if len(direct_p_nodes) == 0: - # This div has only a head, no paragraphs (standalone head) + if not div.find_all("p", recursive=False): # only head, no paragraphs current_head_paragraph = self._clean_text(head.get_text()) else: - # This div has both head and paragraphs - head is the section header head_section = self._clean_text(head.get_text()) else: - # If no head element, try to use the type attribute as head_section div_type = div.get("type") if div_type: - # Handle specific div types with appropriate section names - if div_type == "acknowledgement": - head_section = "Acknowledgements" - elif div_type == "conflict": - head_section = "Conflicts of Interest" - elif div_type == "contribution": - head_section = "Author Contributions" - elif div_type == "availability": - # Only set as default if this div has its own content - if len(direct_p_nodes) > 0: - head_section = "Data Availability" - elif div_type == "annex": - head_section = "Annex" - else: - # Generic handling - capitalize and format - head_section = div_type.replace("_", " ").title() - - # Process paragraphs in this div - if len(direct_p_nodes) > 0: - for id_p, p in enumerate(direct_p_nodes): + mapping = { + "acknowledgement": "Acknowledgements", + "conflict": "Conflicts of Interest", + "contribution": "Author Contributions", + "availability": "Data Availability", + "annex": "Annex", + } + head_section = mapping.get(div_type) or div_type.replace("_", " ").title() + + # ------------------------------------------------------------------ + # 2. Walk the direct children in order + # ------------------------------------------------------------------ + paragraph_id = None + for child in div.children: + if not hasattr(child, "name") or not child.name: + continue # skip NavigableString + + # ----- nested divs ------------------------------------------------ + if child.name in ("div",) or child.name.endswith(":div"): + # recurse – the nested div will use its own header + yield from self._process_div_with_nested_content( + child, passage_level, None + ) + continue + + # ----- paragraphs ------------------------------------------------- + if child.name == "p": paragraph_id = get_random_id(prefix="p_") - if passage_level == "sentence": - for id_s, sentence in enumerate(p.find_all("s")): - struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence) - if self.validate_refs: - for ref in struct['refs']: - assert "Wrong offsets", ref['offset_start'] < ref['offset_end'] - assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'] - yield struct + for sentence in child.find_all("s"): + yield get_formatted_passage( + current_head_paragraph or head_paragraph, + head_section, + paragraph_id, + sentence, + ) else: - struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p) - if self.validate_refs: - for ref in struct['refs']: - assert "Wrong offsets", ref['offset_start'] < ref['offset_end'] - assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'] - yield struct - - # Update head_paragraph for potential next div + yield get_formatted_passage( + current_head_paragraph or head_paragraph, + head_section, + paragraph_id, + child, + ) + continue + + # ----- formulas --------------------------------------------------- + if child.name == "formula": + fid = ( + child.get("{http://www.w3.org/XML/1998/namespace}id") + or child.get("id") + or get_random_id("f_") + ) + raw = child.get_text(separator=" ", strip=True) + formula_text = self._clean_text(raw) + + # coordinates (if GROBID gave them) + coords = [] + if child.has_attr("coords"): + for c in child["coords"].split(";"): + if c.strip(): + coords.append(box_to_dict(c.split(","))) + + passage = { + "id": fid, + "text": formula_text, + "coords": coords, + "refs": [], + "type": "formula", + } + if current_head_paragraph or head_paragraph: + passage["head_paragraph"] = current_head_paragraph or head_paragraph + if head_section: + passage["head_section"] = head_section + + yield passage + continue + + # ------------------------------------------------------------------ + # 3. Propagate a possible standalone head for the next div + # ------------------------------------------------------------------ if current_head_paragraph is not None: - head_paragraph = current_head_paragraph + # (the caller will receive the updated value via the generator) + pass def process_directory(self, directory: Union[str, Path], pattern: str = "*.tei.xml", parallel: bool = True, workers: int = None) -> Iterator[Dict]: """Process a directory of TEI files and yield converted documents. @@ -1004,3 +1025,4 @@ def xml_table_to_json(table_element): def convert_tei_file(tei_file: Union[Path, BinaryIO], stream: bool = False): converter = TEI2LossyJSONConverter() return converter.convert_tei_file(tei_file, stream=stream) + From 574c58c38fda063ff02f878d926b6d27c1866597 Mon Sep 17 00:00:00 2001 From: sanakhamassi Date: Wed, 12 Nov 2025 19:27:44 +0100 Subject: [PATCH 2/2] Update json format to include formulas --- grobid_client/format/TEI2LossyJSON.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/grobid_client/format/TEI2LossyJSON.py b/grobid_client/format/TEI2LossyJSON.py index f437f72..77baf95 100644 --- a/grobid_client/format/TEI2LossyJSON.py +++ b/grobid_client/format/TEI2LossyJSON.py @@ -681,11 +681,8 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa head_section = None current_head_paragraph = None - # ------------------------------------------------------------------ - # 1. Determine the section header - # ------------------------------------------------------------------ if head: - if not div.find_all("p", recursive=False): # only head, no paragraphs + if not div.find_all("p", recursive=False): current_head_paragraph = self._clean_text(head.get_text()) else: head_section = self._clean_text(head.get_text()) @@ -701,15 +698,12 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa } head_section = mapping.get(div_type) or div_type.replace("_", " ").title() - # ------------------------------------------------------------------ - # 2. Walk the direct children in order - # ------------------------------------------------------------------ paragraph_id = None for child in div.children: if not hasattr(child, "name") or not child.name: - continue # skip NavigableString + continue - # ----- nested divs ------------------------------------------------ + # nested divs if child.name in ("div",) or child.name.endswith(":div"): # recurse – the nested div will use its own header yield from self._process_div_with_nested_content( @@ -717,7 +711,7 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa ) continue - # ----- paragraphs ------------------------------------------------- + # paragraphs if child.name == "p": paragraph_id = get_random_id(prefix="p_") if passage_level == "sentence": @@ -737,7 +731,7 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa ) continue - # ----- formulas --------------------------------------------------- + # formulas if child.name == "formula": fid = ( child.get("{http://www.w3.org/XML/1998/namespace}id") @@ -769,11 +763,8 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa yield passage continue - # ------------------------------------------------------------------ - # 3. Propagate a possible standalone head for the next div - # ------------------------------------------------------------------ + if current_head_paragraph is not None: - # (the caller will receive the updated value via the generator) pass def process_directory(self, directory: Union[str, Path], pattern: str = "*.tei.xml", parallel: bool = True, workers: int = None) -> Iterator[Dict]: