histcite

What is HistCite-Python?

HistCite-Python is a Python package for parsing scientific papers' references and recognizing citation relationships between them.

It originated from the HistCite project, which has not been maintained by Clarivate for some years. With pandas 2.0 and Graphviz, HistCite-Python has implemented the core functions of HistCite and extended them with some new features.

  • Support multiple operating systems: Windows, Linux and macOS.
  • Support multiple literature databases: Web of Science, Scopus, CSSCI.

HistCite-Python is an open source project, you can find the source code on GitHub. If you have any questions or suggestions, please submit an issue on GitHub.

Certainly, welcome to contribute to this project.

"""
## What is HistCite-Python?
HistCite-Python is a Python package for parsing scientific papers' references and recognizing citation relationships between them.

It's originated from the [HistCite project](https://support.clarivate.com/ScientificandAcademicResearch/s/article/HistCite-No-longer-in-active-development-or-officially-supported),
which is no longer maintained by Clarivate for some years. With pandas 2.0 and Graphviz, HistCite-Python has implemented the core functions of HistCite, and extended some new features.

- Support multiple OS systems, Windows, Linux and Mac OS.
- Support multiple literature databases, Web of Science, Scopus, CSSCI.

HistCite-Python is an **open source** project, you can find the source code on [GitHub](https://github.com/doublessay/histcite-python).
If you have any questions or suggestions, please submit an issue on GitHub.

Certainly, welcome to contribute to this project.
"""

__version__ = "0.5.2"

from .compute_metrics import ComputeMetrics
from .network_graph import GraphViz
from .parse_reference import ParseReference
from .process_file import ProcessFile
from .read_file import (
    ReadFile,
    ReadWosFile,
    ReadCssciFile,
    ReadScopusFile,
)
from .recognize_reference import RecognizeReference

# Public API of the package.
__all__ = [
    "ComputeMetrics",
    "GraphViz",
    "ParseReference",
    "ProcessFile",
    "ReadFile",
    "ReadWosFile",
    "ReadCssciFile",
    "ReadScopusFile",
    "RecognizeReference",
]
class ComputeMetrics:
    """Compute descriptive statistics of docs.

    Attributes:
        merged_docs_df: DataFrame of docs merged with citation relationship.
        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
    """

    def __init__(
        self,
        docs_df: pd.DataFrame,
        citation_relationship: pd.DataFrame,
        source_type: Literal["wos", "cssci", "scopus"],
    ):
        """
        Args:
            docs_df: DataFrame of docs.
            citation_relationship: DataFrame of citation relationship.
            source_type: Source type of docs, `wos`, `cssci` or `scopus`.
        """
        self.merged_docs_df: pd.DataFrame = docs_df.merge(
            citation_relationship[["doc_index", "LCR", "LCS"]], on="doc_index"
        )
        self.source_type: Literal["wos", "cssci", "scopus"] = source_type

    @staticmethod
    def generate_df_factory(
        merged_docs_df: pd.DataFrame,
        use_cols: list[str],
        col: str,
        split_char: Optional[str] = None,
        str_lower: bool = False,
        sort_by_col: Literal["Recs", "TLCS", "TGCS"] = "Recs",
    ) -> pd.DataFrame:
        """A factory method to generate DataFrame of specific field.
        You can analyze any field besides the provided functions through this method.

        Args:
            merged_docs_df: DataFrame of docs merged with citation relationship.
            use_cols: Columns to use, e.g. `["AU", "LCS", "TC"]`.
            col: Column to analyze, e.g. `AU`.
            split_char: Whether to split string, e.g. `; `. Defaults to `None`.
            str_lower: Whether to convert string to lowercase. Defaults to `False`.
            sort_by_col: Sort DataFrame by column, `Recs`, `TLCS` or `TGCS`. Defaults to `Recs`.

        Returns:
            DataFrame of specific field.
        """
        assert col in use_cols, "Argument <col> must be in use_cols"
        if sort_by_col == "TLCS":
            assert "LCS" in use_cols, "LCS must be in use_cols before sorting by TLCS"
        elif sort_by_col == "TGCS":
            assert "TC" in use_cols, "TC must be in use_cols before sorting by TGCS"

        # Work on a copy so the split/explode below never mutates merged_docs_df
        # (also avoids pandas chained-assignment warnings).
        df = merged_docs_df[use_cols].copy()
        if split_char:
            df = df.dropna(subset=[col])
            df = df.astype({col: "str"})
            if str_lower:
                df[col] = df[col].str.lower()
            df[col] = df[col].str.split(split_char)
            df = df.explode(col)
            df = df.reset_index(drop=True)

        # Recs = record count; LCS/TC sums become TLCS/TGCS after renaming.
        # TC is only aggregated when LCS is also requested (original behavior).
        agg_spec: dict[str, str] = {col: "count"}
        if "LCS" in use_cols:
            agg_spec["LCS"] = "sum"
            if "TC" in use_cols:
                agg_spec["TC"] = "sum"
        grouped_df = df.groupby(col).agg(agg_spec)

        grouped_df.rename(
            columns={col: "Recs", "LCS": "TLCS", "TC": "TGCS"}, inplace=True
        )
        # e.g. "Andersson, Gerhard (7202645907)" -> strip the trailing numeric id.
        if col == "Author full names":
            grouped_df.index = grouped_df.index.str.replace(r" \(\d+\)", "", regex=True)

        if not sort_by_col:
            sort_by_col = "Recs"
        return grouped_df.sort_values(sort_by_col, ascending=False)

    def generate_records_df(self) -> pd.DataFrame:
        """Return records DataFrame. Similar to `merged_docs_df`."""
        if self.source_type in ["wos", "scopus"]:
            # FIX: "TI" was listed twice here, producing a duplicate column.
            use_cols = [
                "AU",
                "TI",
                "SO",
                "PY",
                "LCS",
                "TC",
                "LCR",
                "NR",
                "source file",
            ]
        elif self.source_type == "cssci":
            use_cols = ["AU", "TI", "SO", "PY", "LCS", "LCR", "NR", "source file"]
        else:
            raise ValueError("Invalid source type")
        records_df = self.merged_docs_df[use_cols]
        # Rename to HistCite-style column names: TC -> GCS, NR -> GCR.
        rename_map = {
            old: new for old, new in (("TC", "GCS"), ("NR", "GCR")) if old in use_cols
        }
        return records_df.rename(columns=rename_map)

    def generate_author_df(self) -> pd.DataFrame:
        """Return author DataFrame."""
        if self.source_type == "wos":
            use_cols = ["AU", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["AU", "LCS"]
        elif self.source_type == "scopus":
            # Scopus uses the long author-name field instead of "AU".
            use_cols = ["Author full names", "LCS", "TC"]
        else:
            raise ValueError("Invalid source type")
        return self.generate_df_factory(
            self.merged_docs_df, use_cols, use_cols[0], "; "
        )

    def generate_keyword_df(self) -> pd.DataFrame:
        """Return keyword DataFrame."""
        if self.source_type in ["wos", "scopus"]:
            use_cols = ["DE", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["DE", "LCS"]
        else:
            raise ValueError("Invalid source type")
        # Keywords are lowercased so that case variants are counted together.
        return self.generate_df_factory(self.merged_docs_df, use_cols, "DE", "; ", True)

    def generate_institution_df(self) -> pd.DataFrame:
        """Return institution DataFrame. Not support Scopus."""
        assert (
            self.source_type != "scopus"
        ), "Scopus is not supported to analyze institution field yet."
        if self.source_type == "wos":
            use_cols = ["C3", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["C3", "LCS"]
        else:
            raise ValueError("Invalid source type")
        return self.generate_df_factory(self.merged_docs_df, use_cols, "C3", "; ")

    def generate_journal_df(self) -> pd.DataFrame:
        """Return journal DataFrame."""
        if self.source_type in ["wos", "scopus"]:
            use_cols = ["SO", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["SO", "LCS"]
        else:
            raise ValueError("Invalid source type")
        return self.generate_df_factory(self.merged_docs_df, use_cols, "SO")

    def generate_year_df(self) -> pd.DataFrame:
        """Return publication year DataFrame. Sort by `PY` ascending."""
        use_cols = ["PY"]
        # "PY" ends up as the index after grouping; sort_values accepts index
        # level names, so this re-sorts ascending by year.
        return self.generate_df_factory(
            self.merged_docs_df, use_cols, "PY"
        ).sort_values(by="PY")

    def generate_document_type_df(self) -> pd.DataFrame:
        """Return document type DataFrame. Not support CSSCI."""
        assert self.source_type != "cssci", "CSSCI doesn't have document type info"
        use_cols = ["DT"]
        return self.generate_df_factory(self.merged_docs_df, use_cols, "DT")

    def write2excel(self, save_path: str):
        """Write all dataframes to an excel file. Each dataframe is a sheet.

        Args:
            save_path: The path to save the excel file.

        Returns:
            An excel file with multiple sheets.
        """
        save_folder_path = os.path.dirname(save_path)
        # FIX: dirname is "" for a bare file name; os.makedirs("") raises.
        if save_folder_path:
            os.makedirs(save_folder_path, exist_ok=True)
        with pd.ExcelWriter(save_path) as writer:
            self.generate_records_df().to_excel(
                writer, sheet_name="Records", index=False
            )
            self.generate_author_df().to_excel(writer, sheet_name="Authors")
            self.generate_journal_df().to_excel(writer, sheet_name="Journals")
            self.generate_keyword_df().to_excel(writer, sheet_name="Keywords")
            self.generate_year_df().to_excel(writer, sheet_name="Years")

            if self.source_type in ["wos", "cssci"]:
                self.generate_institution_df().to_excel(
                    writer, sheet_name="Institutions"
                )
            if self.source_type in ["wos", "scopus"]:
                self.generate_document_type_df().to_excel(
                    writer, sheet_name="Document Type"
                )

Compute descriptive statistics of docs.

Attributes:
  • merged_docs_df: DataFrame of docs merged with citation relationship.
  • source_type: Source type of docs, wos, cssci or scopus.
ComputeMetrics( docs_df: pandas.core.frame.DataFrame, citation_relationship: pandas.core.frame.DataFrame, source_type: Literal['wos', 'cssci', 'scopus'])
26    def __init__(
27        self,
28        docs_df: pd.DataFrame,
29        citation_relationship: pd.DataFrame,
30        source_type: Literal["wos", "cssci", "scopus"],
31    ):
32        """
33        Args:
34            docs_df: DataFrame of docs.
35            citation_relationship: DataFrame of citation relationship.
36            source_type: Source type of docs, `wos`, `cssci` or `scopus`.
37        """
38        self.merged_docs_df: pd.DataFrame = docs_df.merge(
39            citation_relationship[["doc_index", "LCR", "LCS"]], on="doc_index"
40        )
41        self.source_type: Literal["wos", "cssci", "scopus"] = source_type
Arguments:
  • docs_df: DataFrame of docs.
  • citation_relationship: DataFrame of citation relationship.
  • source_type: Source type of docs, wos, cssci or scopus.
merged_docs_df: pandas.core.frame.DataFrame
source_type: Literal['wos', 'cssci', 'scopus']
@staticmethod
def generate_df_factory( merged_docs_df: pandas.core.frame.DataFrame, use_cols: list[str], col: str, split_char: Optional[str] = None, str_lower: bool = False, sort_by_col: Literal['Recs', 'TLCS', 'TGCS'] = 'Recs') -> pandas.core.frame.DataFrame:
 43    @staticmethod
 44    def generate_df_factory(
 45        merged_docs_df: pd.DataFrame,
 46        use_cols: list[str],
 47        col: str,
 48        split_char: Optional[str] = None,
 49        str_lower: bool = False,
 50        sort_by_col: Literal["Recs", "TLCS", "TGCS"] = "Recs",
 51    ) -> pd.DataFrame:
 52        """A factory method to generate DataFrame of specific field.
 53        You can analyze any field besides the provided functions through this method.
 54
 55        Args:
 56            merged_docs_df: DataFrame of docs merged with citation relationship.
 57            use_cols: Columns to use, e.g. `["AU", "LCS", "TC"]`.
 58            col: Column to analyze, e.g. `AU`.
 59            split_char: Whether to split string, e.g. `; `. Defaults to `None`.
 60            str_lower: Whether to convert string to lowercase. Defaults to `False`.
 61            sort_by_col: Sort DataFrame by column, `Recs`, `TLCS` or `TGCS`. Defaults to `Recs`.
 62
 63        Returns:
 64            DataFrame of specific field.
 65        """
 66        assert col in use_cols, "Argument <col> must be in use_cols"
 67        if sort_by_col == "TLCS":
 68            assert "LCS" in use_cols, "LCS must be in use_cols before sorting by TLCS"
 69        elif sort_by_col == "TGCS":
 70            assert "TC" in use_cols, "TC must be in use_cols before sorting by TGCS"
 71
 72        df = merged_docs_df[use_cols]
 73        if split_char:
 74            df = df.dropna(subset=[col])
 75            df = df.astype({col: "str"})
 76            if str_lower:
 77                df[col] = df[col].str.lower()
 78            df[col] = df[col].str.split(split_char)
 79            df = df.explode(col)
 80            df = df.reset_index(drop=True)
 81
 82        if "LCS" in use_cols:
 83            if "TC" in use_cols:
 84                grouped_df = df.groupby(col).agg(
 85                    {col: "count", "LCS": "sum", "TC": "sum"}
 86                )
 87            else:
 88                grouped_df = df.groupby(col).agg({col: "count", "LCS": "sum"})
 89        else:
 90            grouped_df = df.groupby(col).agg({col: "count"})
 91
 92        grouped_df.rename(
 93            columns={col: "Recs", "LCS": "TLCS", "TC": "TGCS"}, inplace=True
 94        )
 95        # e.g. Andersson, Gerhard (7202645907)
 96        if col == "Author full names":
 97            grouped_df.index = grouped_df.index.str.replace(r" \(\d+\)", "", regex=True)
 98
 99        if not sort_by_col:
100            sort_by_col = "Recs"
101        return grouped_df.sort_values(sort_by_col, ascending=False)

A factory method to generate DataFrame of specific field. You can analyze any field besides the provided functions through this method.

Arguments:
  • merged_docs_df: DataFrame of docs merged with citation relationship.
  • use_cols: Columns to use, e.g. ["AU", "LCS", "TC"].
  • col: Column to analyze, e.g. AU.
  • split_char: Whether to split string, e.g. ;. Defaults to None.
  • str_lower: Whether to convert string to lowercase. Defaults to False.
  • sort_by_col: Sort DataFrame by column, Recs, TLCS or TGCS. Defaults to Recs.
Returns:

DataFrame of specific field.

def generate_records_df(self) -> pandas.core.frame.DataFrame:
103    def generate_records_df(self) -> pd.DataFrame:
104        """Return records DataFrame. Similar to `merged_docs_df`."""
105        if self.source_type in ["wos", "scopus"]:
106            use_cols = [
107                "AU",
108                "TI",
109                "SO",
110                "PY",
111                "TI",
112                "LCS",
113                "TC",
114                "LCR",
115                "NR",
116                "source file",
117            ]
118        elif self.source_type == "cssci":
119            use_cols = ["AU", "TI", "SO", "PY", "LCS", "LCR", "NR", "source file"]
120        else:
121            raise ValueError("Invalid source type")
122        records_df = self.merged_docs_df[use_cols]
123        if "TC" in use_cols:
124            records_df = records_df.rename(columns={"TC": "GCS"})
125        if "NR" in use_cols:
126            records_df = records_df.rename(columns={"NR": "GCR"})
127        return records_df

Return records DataFrame. Similar to merged_docs_df.

def generate_author_df(self) -> pandas.core.frame.DataFrame:
129    def generate_author_df(self) -> pd.DataFrame:
130        """Return author DataFrame."""
131        if self.source_type == "wos":
132            use_cols = ["AU", "LCS", "TC"]
133        elif self.source_type == "cssci":
134            use_cols = ["AU", "LCS"]
135        elif self.source_type == "scopus":
136            use_cols = ["Author full names", "LCS", "TC"]
137        else:
138            raise ValueError("Invalid source type")
139        return self.generate_df_factory(
140            self.merged_docs_df, use_cols, use_cols[0], "; "
141        )

Return author DataFrame.

def generate_keyword_df(self) -> pandas.core.frame.DataFrame:
143    def generate_keyword_df(self) -> pd.DataFrame:
144        """Return keyword DataFrame."""
145        if self.source_type in ["wos", "scopus"]:
146            use_cols = ["DE", "LCS", "TC"]
147        elif self.source_type == "cssci":
148            use_cols = ["DE", "LCS"]
149        else:
150            raise ValueError("Invalid source type")
151        return self.generate_df_factory(self.merged_docs_df, use_cols, "DE", "; ", True)

Return keyword DataFrame.

def generate_institution_df(self) -> pandas.core.frame.DataFrame:
153    def generate_institution_df(self) -> pd.DataFrame:
154        """Return institution DataFrame. Not support Scopus."""
155        assert (
156            self.source_type != "scopus"
157        ), "Scopus is not supported to analyze institution field yet."
158        if self.source_type == "wos":
159            use_cols = ["C3", "LCS", "TC"]
160        elif self.source_type == "cssci":
161            use_cols = ["C3", "LCS"]
162        else:
163            raise ValueError("Invalid source type")
164        return self.generate_df_factory(self.merged_docs_df, use_cols, "C3", "; ")

Return institution DataFrame. Not support Scopus.

def generate_journal_df(self) -> pandas.core.frame.DataFrame:
166    def generate_journal_df(self) -> pd.DataFrame:
167        """Return journal DataFrame."""
168        if self.source_type in ["wos", "scopus"]:
169            use_cols = ["SO", "LCS", "TC"]
170        elif self.source_type == "cssci":
171            use_cols = ["SO", "LCS"]
172        else:
173            raise ValueError("Invalid source type")
174        return self.generate_df_factory(self.merged_docs_df, use_cols, "SO")

Return journal DataFrame.

def generate_year_df(self) -> pandas.core.frame.DataFrame:
176    def generate_year_df(self) -> pd.DataFrame:
177        """Return publication year DataFrame. Sort by `PY` ascending."""
178        use_cols = ["PY"]
179        return self.generate_df_factory(
180            self.merged_docs_df, use_cols, "PY"
181        ).sort_values(by="PY")

Return publication year DataFrame. Sort by PY ascending.

def generate_document_type_df(self) -> pandas.core.frame.DataFrame:
183    def generate_document_type_df(self) -> pd.DataFrame:
184        """Return document type DataFrame. Not support CSSCI."""
185        assert self.source_type != "cssci", "CSSCI doesn't have document type info"
186        use_cols = ["DT"]
187        return self.generate_df_factory(self.merged_docs_df, use_cols, "DT")

Return document type DataFrame. Not support CSSCI.

def write2excel(self, save_path: str):
206    def write2excel(self, save_path: str):
207        """Write all dataframes to an excel file. Each dataframe is a sheet.
208
209        Args:
210            save_path: The path to save the excel file.
211
212        Returns:
213            An excel file with multiple sheets.
214        """
215        save_folder_path = os.path.dirname(save_path)
216        if not os.path.exists(save_folder_path):
217            os.makedirs(save_folder_path)
218        with pd.ExcelWriter(save_path) as writer:
219            self.generate_records_df().to_excel(
220                writer, sheet_name="Records", index=False
221            )
222            self.generate_author_df().to_excel(writer, sheet_name="Authors")
223            self.generate_journal_df().to_excel(writer, sheet_name="Journals")
224            self.generate_keyword_df().to_excel(writer, sheet_name="Keywords")
225            self.generate_year_df().to_excel(writer, sheet_name="Years")
226
227            # if self.refs_df is not None:
228            #     self.generate_reference_df().to_excel(
229            #         writer, sheet_name="Cited References", index=False
230            #     )
231            if self.source_type in ["wos", "cssci"]:
232                self.generate_institution_df().to_excel(
233                    writer, sheet_name="Institutions"
234                )
235            if self.source_type in ["wos", "scopus"]:
236                self.generate_document_type_df().to_excel(
237                    writer, sheet_name="Document Type"
238                )

Write all dataframes to an excel file. Each dataframe is a sheet.

Arguments:
  • save_path: The path to save the excel file.
Returns:

An excel file with multiple sheets.

class GraphViz:
    """Generate dot file for Graphviz. Support citation network of multi docs and specific doc.

    Attributes:
        empty_year_index: Index of docs without `PY` info. These docs will be removed from citation network if `generate_dot_file.show_timeline = True`.
        merged_docs_df: DataFrame of docs with citation relationship.
        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
    """

    def __init__(
        self,
        docs_df: pd.DataFrame,
        citation_relationship: pd.DataFrame,
        source_type: Literal["wos", "cssci", "scopus"],
    ):
        """
        Args:
            docs_df: DataFrame of docs.
            citation_relationship: DataFrame of citation relationship.
            source_type: Source type of docs, `wos`, `cssci` or `scopus`.
        """
        self.empty_year_index: pd.Index = docs_df[docs_df["PY"].isna()].index
        # Merge on the row index; both frames carry a "doc_index" column, so the
        # duplicate from citation_relationship gets the "_y" suffix and is dropped.
        self.merged_docs_df: pd.DataFrame = docs_df.merge(
            citation_relationship,
            left_index=True,
            right_index=True,
            suffixes=(None, "_y"),
        ).drop(columns=["doc_index_y"])
        self.source_type: Literal["wos", "cssci", "scopus"] = source_type

    @staticmethod
    def _generate_edge(
        doc_index: int,
        related_doc_index_list: Union[str, list[int]],
        citation_type: Literal["cited", "citing"],
    ) -> set[tuple[int, int]]:
        """Build directed edges (citer, cited) between a doc and its related docs.

        `related_doc_index_list` may be a ";"-joined string of doc indices.
        """
        if isinstance(related_doc_index_list, str):
            related_doc_index_list = [int(i) for i in related_doc_index_list.split(";")]
        if citation_type == "cited":
            return {(doc_index, ref) for ref in related_doc_index_list}
        else:
            return {(citation, doc_index) for citation in related_doc_index_list}

    def _generate_edge_set_from_specific_doc(
        self,
        doc_index: int,
        edge_type: Literal["cited", "citing"],
    ):
        """Collect all edges reachable from `doc_index`, following `edge_type` links transitively."""
        col = "cited_doc_index" if edge_type == "cited" else "citing_doc_index"
        edge_set: set[tuple[int, int]] = set()
        visited: set[int] = set()
        pending_doc_index: list[int] = [doc_index]
        while pending_doc_index:
            current = pending_doc_index.pop()
            # FIX: guard against citation cycles, which previously looped forever.
            if current in visited:
                continue
            visited.add(current)
            cell = self.merged_docs_df.loc[current, col]
            # A missing relationship is NaN/None; only a ";"-joined string holds links.
            if isinstance(cell, str):
                related_doc_index = [int(i) for i in cell.split(";")]
                pending_doc_index.extend(related_doc_index)
                edge_set.update(self._generate_edge(current, related_doc_index, edge_type))
        return edge_set

    def _generate_edge_set_from_multi_doc(self, doc_indices: list[int]):
        """Collect edges among `doc_indices`; edges touching other docs are dropped."""
        edge_set: set[tuple[int, int]] = set()
        for idx in doc_indices:
            cited_doc_index = self.merged_docs_df.loc[idx, "cited_doc_index"]
            citing_doc_index = self.merged_docs_df.loc[idx, "citing_doc_index"]
            if isinstance(cited_doc_index, str):
                edge_set.update(self._generate_edge(idx, cited_doc_index, "cited"))
            if isinstance(citing_doc_index, str):
                edge_set.update(self._generate_edge(idx, citing_doc_index, "citing"))
        # O(1) membership tests instead of scanning the list per edge endpoint.
        selected = set(doc_indices)
        return {
            edge for edge in edge_set if edge[0] in selected and edge[1] in selected
        }

    def _generate_edge_set(self) -> dict[int, list[int]]:
        """Build the adjacency dict (source doc -> target docs) and set `self.node_list`."""
        if len(self.doc_indices) > 1:
            edge_set = self._generate_edge_set_from_multi_doc(self.doc_indices)
        else:
            initial_doc_index = self.doc_indices[0]
            if self.edge_type == "cited":
                edge_set = self._generate_edge_set_from_specific_doc(
                    initial_doc_index, "cited"
                )
            elif self.edge_type == "citing":
                edge_set = self._generate_edge_set_from_specific_doc(
                    initial_doc_index, "citing"
                )
            elif self.edge_type is None:
                # No direction given: include both cited and citing closures.
                edge_set = self._generate_edge_set_from_specific_doc(
                    initial_doc_index, "cited"
                )
                edge_set.update(
                    self._generate_edge_set_from_specific_doc(
                        initial_doc_index, "citing"
                    )
                )
            else:
                raise ValueError(
                    'Argument <edge_type> must be one of "cited", "citing" or None'
                )

        # Drop nodes without PY info; they cannot be placed on the timeline.
        if len(self.empty_year_index) > 0 and self.show_timeline is True:
            edge_set = {
                edge
                for edge in edge_set
                if edge[0] not in self.empty_year_index
                and edge[1] not in self.empty_year_index
            }

        # Build node_list according to edges
        source_node = {i for i, _ in edge_set}
        target_node = {j for _, j in edge_set}
        self.node_list = sorted(source_node | target_node)

        edge_dict: dict[int, list[int]] = {i: [] for i in sorted(source_node)}
        for source, target in edge_set:
            edge_dict[source].append(target)
        return edge_dict

    def _obtain_groups(self):
        """Obtain groups of doc_index by year."""
        year_series = self.merged_docs_df.loc[self.node_list, "PY"]
        year_groups = year_series.groupby(year_series).groups.items()
        year_list = [year for year, _ in year_groups]
        grouped_doc_index = [list(indices) for _, indices in year_groups]
        if self.show_timeline is True:
            # Prepend the year label so it lands in the same rank group as its docs.
            for idx, year in enumerate(year_list):
                grouped_doc_index[idx].insert(0, year)
        return grouped_doc_index, year_list

    def generate_dot_file(
        self,
        doc_indices: Union[list[int], int],
        edge_type: Optional[Literal["cited", "citing"]] = None,
        show_timeline: bool = True,
    ) -> str:
        """
        Args:
            doc_indices: Specific doc_index or list of doc_index. If list, only show edges between these doc_index.
            edge_type: Only for specific doc_index. It can be `cited`, `citing` or `None`. If `None`, show both `cited` and `citing` edges. Default `None`.
            show_timeline: Whether show timeline. In some cases, timeline may be disorderly, so you can set it to `False`. Default `True`.

        Returns:
            Dot file content.
        """
        # FIX: normalize to a list first. Previously a single-element list fell
        # through both branches, leaving self.doc_indices unset (AttributeError).
        if isinstance(doc_indices, int):
            doc_indices = [doc_indices]
        assert doc_indices, "Argument <doc_indices> can't be empty."
        if len(doc_indices) > 1:
            assert (
                edge_type is None
            ), "Argument <edge_type> should be None if <doc_indices> contains >1 elements."
        else:
            assert (
                doc_indices[0] in self.merged_docs_df.index
            ), "Don't select doc_index not in docs_df."
            assert (
                doc_indices[0] not in self.empty_year_index
            ), "Don't select doc_index without <PY> info."
        self.doc_indices = doc_indices
        self.edge_type = edge_type
        self.show_timeline = show_timeline

        edge_dict = self._generate_edge_set()
        grouped_doc_index, year_list = self._obtain_groups()

        # One "rank=same" line per year keeps that year's docs on one level.
        dot_groups = [
            f'\t{{rank=same; {" ".join([str(i) for i in group_index])}}};\n'
            for group_index in grouped_doc_index
        ]
        dot_edge_list = [
            f"\t{source} -> "
            + "{ "
            + " ".join([str(i) for i in edge_dict[source]])
            + " };\n"
            for source in edge_dict.keys()
        ]

        if self.show_timeline is True:
            # Invisible edges chain the year nodes newest -> oldest to fix rank order.
            reversed_year_list = year_list[::-1]
            year_edge_list = [
                (year, reversed_year_list[idx + 1])
                for idx, year in enumerate(reversed_year_list)
                if idx < len(reversed_year_list) - 1
            ]
            dot_year_node_list = [
                f'\t{year} [ shape="plaintext" ];\n' for year in year_list
            ]
            dot_year_edge_list = [
                f"\t{edge[0]} -> {edge[1]} [ style = invis ];\n"
                for edge in year_edge_list
            ]
        else:
            dot_year_node_list, dot_year_edge_list = [], []

        dot_text = "digraph metadata{\n\trankdir = BT;\n"
        dot_text += "".join(dot_groups)
        dot_text += "".join(dot_year_node_list)
        dot_text += "".join(dot_year_edge_list)
        dot_text += "".join(dot_edge_list)
        dot_text += "}"
        return dot_text

    def generate_graph_node_info(self) -> pd.DataFrame:
        """Generate dataframe of graph node info. Columns differ according to `source_type`.

        Returns:
            Dataframe of graph node info.
        """
        if self.source_type == "wos":
            use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS", "TC"]
        elif self.source_type == "cssci":
            use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS"]
        elif self.source_type == "scopus":
            use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS", "TC"]
        else:
            raise ValueError("invalid source type")
        graph_node_info = self.merged_docs_df.loc[self.node_list, use_cols]
        if "TC" in graph_node_info.columns:
            graph_node_info.rename(columns={"TC": "GCS"}, inplace=True)
        return graph_node_info

    def _export_graph_node_info(self, file_path: str):
        """Write the graph node info table to an excel file."""
        self.generate_graph_node_info().to_excel(file_path, index=False)

Generate dot file for Graphviz. Support citation network of multi docs and specific doc.

Attributes:
  • empty_year_index: Index of docs without PY info. These docs will be removed from citation network if generate_dot_file.show_timeline = True.
  • merged_docs_df: DataFrame of docs with citation relationship.
  • source_type: Source type of docs, wos, cssci or scopus.
GraphViz( docs_df: pandas.core.frame.DataFrame, citation_relationship: pandas.core.frame.DataFrame, source_type: Literal['wos', 'cssci', 'scopus'])
16    def __init__(
17        self,
18        docs_df: pd.DataFrame,
19        citation_relationship: pd.DataFrame,
20        source_type: Literal["wos", "cssci", "scopus"],
21    ):
22        """
23        Args:
24            docs_df: DataFrame of docs.
25            citation_relationship: DataFrame of citation relationship.
26            source_type: Source type of docs, `wos`, `cssci` or `scopus`.
27        """
28        self.empty_year_index: pd.Index = docs_df[docs_df["PY"].isna()].index
29        self.merged_docs_df: pd.DataFrame = docs_df.merge(
30            citation_relationship,
31            left_index=True,
32            right_index=True,
33            suffixes=(None, "_y"),
34        ).drop(columns=["doc_index_y"])
35        self.source_type: Literal["wos", "cssci", "scopus"] = source_type
Arguments:
  • docs_df: DataFrame of docs.
  • citation_relationship: DataFrame of citation relationship.
  • source_type: Source type of docs, wos, cssci or scopus.
empty_year_index: pandas.core.indexes.base.Index
merged_docs_df: pandas.core.frame.DataFrame
source_type: Literal['wos', 'cssci', 'scopus']
def generate_dot_file( self, doc_indices: Union[list[int], int], edge_type: Optional[Literal['cited', 'citing']] = None, show_timeline: bool = True) -> str:
def generate_dot_file(
    self,
    doc_indices: Union[list[int], int],
    edge_type: Optional[Literal["cited", "citing"]] = None,
    show_timeline: bool = True,
) -> str:
    """Build the Graphviz dot source for the selected docs.

    Args:
        doc_indices: Specific doc_index or list of doc_index. If list, only show edges between these doc_index.
        edge_type: Only for specific doc_index. It can be `cited`, `citing` or `None`. If `None`, show both `cited` and `citing` edges. Default `None`.
        show_timeline: Whether show timeline. In some cases, timeline may be disorderly, so you can set it to `False`. Default `True`.

    Returns:
        Dot file content.

    Raises:
        ValueError: If `doc_indices` is an empty list.
    """
    # Normalize to a list. Fix: a single-element (or empty) list previously
    # fell through both the `len > 1` and the `isinstance(int)` branches,
    # leaving self.doc_indices unset or stale.
    if isinstance(doc_indices, int):
        doc_indices = [doc_indices]
    if not doc_indices:
        raise ValueError("Argument <doc_indices> should not be empty.")
    if len(doc_indices) > 1:
        assert (
            edge_type is None
        ), "Argument <edge_type> should be None if <doc_indices> contains >1 elements."
    else:
        # Single-doc selection: validate existence and year info (previously
        # only done for the int form).
        single_index = doc_indices[0]
        assert (
            single_index in self.merged_docs_df.index
        ), "Don't select doc_index not in docs_df."
        assert (
            single_index not in self.empty_year_index
        ), "Don't select doc_index without <PY> info."
    self.doc_indices = doc_indices
    self.edge_type = edge_type
    self.show_timeline = show_timeline

    edge_dict = self._generate_edge_set()
    grouped_doc_index, year_list = self._obtain_groups()

    # One `rank=same` group per publication year keeps nodes of a year on
    # the same horizontal level.
    dot_groups = [
        f'\t{{rank=same; {" ".join([str(i) for i in group_index])}}};\n'
        for group_index in grouped_doc_index
    ]
    dot_edge_list = [
        f"\t{source} -> "
        + "{ "
        + " ".join([str(i) for i in edge_dict[source]])
        + " };\n"
        for source in edge_dict.keys()
    ]

    if self.show_timeline is True:
        # Invisible edges between consecutive years force a vertical
        # chronological ordering of the year labels.
        reversed_year_list = year_list[::-1]
        year_edge_list = [
            (year, reversed_year_list[idx + 1])
            for idx, year in enumerate(reversed_year_list)
            if idx < len(reversed_year_list) - 1
        ]
        dot_year_node_list = [
            f'\t{year} [ shape="plaintext" ];\n' for year in year_list
        ]
        dot_year_edge_list = [
            f"\t{edge[0]} -> {edge[1]} [ style = invis ];\n"
            for edge in year_edge_list
        ]
    else:
        dot_year_node_list, dot_year_edge_list = [], []

    # rankdir = BT: older (cited) docs at the bottom, citing docs above.
    dot_text = "digraph metadata{\n\trankdir = BT;\n"
    for dot_group in dot_groups:
        dot_text += dot_group

    for dot_year_node in dot_year_node_list:
        dot_text += dot_year_node

    for dot_year_edge in dot_year_edge_list:
        dot_text += dot_year_edge

    for dot_edge in dot_edge_list:
        dot_text += dot_edge
    dot_text += "}"
    return dot_text
Arguments:
  • doc_indices: A specific doc_index or a list of doc_index. If a list, only edges among these doc_index values are shown.
  • edge_type: Only for specific doc_index. It can be cited, citing or None. If None, show both cited and citing edges. Default None.
  • show_timeline: Whether show timeline. In some cases, timeline may be disorderly, so you can set it to False. Default True.
Returns:

Dot file content.

def generate_graph_node_info(self) -> pandas.core.frame.DataFrame:
def generate_graph_node_info(self) -> pd.DataFrame:
    """Generate dataframe of graph node info. Columns differ according to `source_type`.

    Returns:
        Dataframe of graph node info.

    Raises:
        ValueError: If `source_type` is not `wos`, `cssci` or `scopus`.
    """
    # wos and scopus expose the same columns (both carry a TC field);
    # cssci has no global citation count.
    if self.source_type in ("wos", "scopus"):
        use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS", "TC"]
    elif self.source_type == "cssci":
        use_cols = ["doc_index", "AU", "TI", "PY", "SO", "LCS"]
    else:
        raise ValueError("invalid source type")
    # .copy() + non-inplace rename: renaming a .loc slice in place risks
    # pandas' SettingWithCopyWarning. rename is a no-op when TC is absent.
    graph_node_info = self.merged_docs_df.loc[self.node_list, use_cols].copy()
    return graph_node_info.rename(columns={"TC": "GCS"})

Generate dataframe of graph node info. Columns differ according to source_type.

Returns:

Dataframe of graph node info.

class ParseReference:
 51class ParseReference:
 52    @staticmethod
 53    def _parse_wos_ref(
 54        ref: str, doc_index: Optional[int] = None
 55    ) -> Optional[dict[str, Optional[str]]]:
 56        """Parse reference of Web of Science.
 57
 58        Args:
 59            ref: A reference string. e.g. `CORTES C, 1995, MACH LEARN, V20, P273, DOI 10.1007/BF00994018`
 60            doc_index: doc_index to which the reference belongs. Default is None.
 61
 62        Returns:
 63            A dict of reference fields.
 64        """
 65        # Refs contain another language except english or AU is anonymous
 66        if re.search(r"[\[\]]", ref):
 67            return None
 68
 69        # Don't parse patent
 70        if "Patent No." in ref:
 71            return None
 72
 73        AU, PY, J9, VL, BP, DI = None, None, None, None, None, None
 74
 75        if ", DOI " in ref:
 76            # Contain only one DOI
 77            if "DOI [" not in ref:
 78                DI_match = re.search(r"DOI (10.*)$", ref)
 79                DI = DI_match[1] if DI_match else None
 80            # Contain multi DOIs
 81            else:
 82                DI_match = re.search(r"DOI \[(.*)\]", ref)
 83                DI = DI_match[1] if DI_match else None
 84            ref = re.sub(r", DOI.*", "", ref)
 85
 86        BP_match = re.search(r", [Pp]([A-Za-z]?\d+)$", ref)
 87        if BP_match:
 88            BP = BP_match[1]
 89            ref = re.sub(r", [Pp][A-Za-z]?\d+", "", ref)
 90
 91        ref = re.sub(r"[,\.] PROCEEDINGS(?=, )", "", ref, flags=re.I)
 92        if VL_match := re.search(r", V([\d-]+)$", ref):
 93            VL = VL_match[1]
 94            sub_pattern = r", V[\d-]+$"
 95
 96        elif re.search(r", VOL[s\.]? ", ref, re.I):
 97            VL_match = re.search(r", VOL[s\.]? ([\w\- ]+)$", ref, re.I)
 98            VL = VL_match[1] if VL_match else None
 99            sub_pattern = r", V[Oo][Ll].*"
100
101        elif VL_match := re.search(r"(?<=[A-Z\.]), V([\w\. ]+)$", ref):
102            VL = VL_match[1]
103            sub_pattern = r"(?<=[A-Z\.]), V[\w\. ]+$"
104
105        else:
106            sub_pattern = None
107
108        if sub_pattern:
109            ref = re.sub(sub_pattern, "", ref)
110
111        dot_count = ref.count(", ")
112        if dot_count == 2:
113            AU, PY, J9 = ref.split(", ")
114        elif dot_count > 2:
115            PY_pattern = r", (\d{4}), "
116            if re.search(PY_pattern, ref):
117                AU, PY, J9 = re.split(PY_pattern, ref, 1)
118        else:
119            return None
120
121        if DI:
122            DI = DI.lower()
123            if len(re.findall(", ", DI)) == 1:
124                try:
125                    DI1, DI2 = DI.replace("doi ", "").split(", ")
126                except:
127                    return None
128                if DI1 == DI2:
129                    DI = DI1
130                else:
131                    DI = None
132
133        if PY and not re.match(r"^\d{4}$", PY):
134            PY = None
135
136        result = asdict(WosField(AU, PY, J9, VL, BP, DI))
137        if doc_index is not None:
138            result["doc_index"] = doc_index
139        return result
140
141    @staticmethod
142    def _parse_cssci_ref(
143        ref: str, doc_index: Optional[int] = None
144    ) -> Optional[dict[str, Optional[str]]]:
145        """Parse reference of CSSCI. Only parse reference in Chinese language.
146
147        Args:
148            ref: A reference string. e.g. `1.严栋.基于物联网的智慧图书馆.图书馆学刊.2010.32(7)`
149            doc_index: doc_index to which the reference belongs. Default is None.
150
151        Returns:
152            A dict of reference fields.
153        """
154        dot_pattern = re.compile(
155            r"(?<!\d)\.(?!\d)|(?<=\d)\.(?!\d)|(?<!\d)\.(?=\d)|(?<=\d{4})\.(?=\d)|(?<=\d)\.(?=\d{4})"
156        )
157
158        if re.search(r"[\u4e00-\u9fa5]", ref):
159            dot_count = len(dot_pattern.findall(ref))
160
161            if re.search(r"[^\d]\.{2,}", ref):
162                return None
163
164            # Dissertation
165            elif ":学位论文." in ref:
166                try:
167                    _, AU, TI, other = ref.split(".")
168                except:
169                    return None
170                else:
171                    TI = TI.replace(":学位论文", "")
172                    SO, PY = other.split(",")
173                    PY = PY.split(":")[0]
174                    raw_result = CssciField(AU, TI, SO, PY, None)
175
176            # Country standard
177            elif "GB/T" in ref:
178                if ref[-3:] == "出版社":
179                    _, AU, other = ref.split(".", 2)
180                    TI, SO = other.rsplit(".", 1)
181                    raw_result = CssciField(AU, TI, SO, None, None)
182                else:
183                    _, AU, TI = ref.split(".", 2)
184                    raw_result = CssciField(AU, TI, None, None, None)
185
186            # Standard
187            elif re.search(r":DB\d{2}/T", ref):
188                _, AU, other = ref.split(".", 2)
189                TI, PY = other.rsplit(".", 1)
190                raw_result = CssciField(AU, TI, None, PY, None)
191
192            # Newspaper
193            elif re.search(r"\.\d{1,2}\.\d{1,2}(?:\(|$)", ref):
194                try:
195                    _, AU, TI, SO, other = re.split(dot_pattern, ref, 4)
196                except:
197                    return None
198                else:
199                    raw_result = CssciField(AU, TI, SO, None, None)
200
201            # Patent1
202            elif re.search(r"\.CN\d{9}[A-Z]$", ref):
203                TI = ref.split(".", 1)[1]
204                raw_result = CssciField(None, TI, None, None, None)
205            # Patent2
206            elif re.search(r"^\d+\.一种", ref):
207                date_pattern = re.compile(r"\d{4}\-\d{1,2}\-\d{1,2}")
208                TI = ref.split(".", 1)[1]
209                date = date_pattern.search(ref)
210                if date:
211                    PY = date[0].split("-")[0]
212                else:
213                    PY = None
214                TI = date_pattern.sub("", TI).strip(".()")
215                raw_result = CssciField(None, TI, None, PY, None)
216
217            # Network resource
218            elif re.search(r"\.\d{4}$", ref):
219                if dot_count == 3:
220                    _, AU, TI, PY = re.split(dot_pattern, ref)
221                elif dot_count == 4:
222                    _, AU, TI, SO, PY = re.split(dot_pattern, ref)
223                else:
224                    return None
225                raw_result = CssciField(AU, TI, None, PY, None)
226
227            # Journal1
228            elif dot_count == 5:
229                _, AU, TI, SO, PY, VL = re.split(dot_pattern, ref)
230                raw_result = CssciField(AU, TI, SO, PY, VL)
231            # Journal2
232            elif dot_count == 4:
233                _, AU, TI, SO, _ = re.split(dot_pattern, ref)
234                raw_result = CssciField(AU, TI, SO, None, None)
235
236            # Book
237            elif dot_count == 3:
238                _, AU, TI, SO = re.split(dot_pattern, ref)
239                raw_result = CssciField(AU, TI, SO, None, None)
240
241            elif dot_count == 2:
242                _, AU, TI = re.split(dot_pattern, ref)
243                raw_result = CssciField(AU, TI, None, None, None)
244
245            elif dot_count == 1:
246                _, TI = re.split(dot_pattern, ref)
247                raw_result = CssciField(None, TI, None, None, None)
248            else:
249                return None
250
251            result = asdict(raw_result)
252            if doc_index is not None:
253                result["doc_index"] = doc_index
254            return result
255
256    @staticmethod
257    def _parse_scopus_ref(
258        ref: str, doc_index: Optional[int] = None
259    ) -> Optional[dict[str, Optional[str]]]:
260        """Parse reference of Scopus.
261
262        Args:
263            ref: A reference string. e.g. `Negri E, Fumagalli L, Macchi M., A Review of the Roles of Digital Twin in CPS-based Production Systems, Procedia Manufacturing, 11, pp. 939-948, (2017)`
264            doc_index: doc_index to which the reference belongs. Default is None.
265
266        Returns:
267            A dict of reference fields.
268        """
269        if re.search(r"^[^A-Z\*\']", ref):
270            return None
271
272        if re.search(r"[\[\]]", ref):
273            return None
274
275        if ref.count(", ") < 2:
276            return None
277
278        # Publication year
279        PY_match = re.search(r", \((\d{4})\)$", ref)
280        if PY_match:
281            PY = PY_match[1]
282            ref = ref.rsplit(", ", 1)[0]
283        else:
284            return None
285
286        First_AU, TI, SO, VL, IS, BP, EP = None, None, None, None, None, None, None
287
288        # remove version info
289        ref = re.sub(r", version [\d\.]+(?=,)", "", ref, flags=re.I)
290
291        # remove doi info
292        ref = re.sub(r", doi:.*(?=,|$)", "", ref, flags=re.I)
293
294        # remove retrieval info
295        ref = re.sub(r"[\.,] Retrieved.*(?=,)", "", ref, flags=re.I)
296        ref = re.sub(r", Available from:(?=,)", "", ref, flags=re.I)
297
298        # Page number
299        if PP_match := re.search(r"(?:, | \()[Pp]{2}[\.,] ([\w\-]+)\)?", ref):
300            PP = PP_match[1]
301            try:
302                BP, EP = re.split(r"-", PP, 1)
303            except:
304                BP, EP = None, None
305            ref = re.sub(r"(?:, | \()[Pp]{2}.*", "", ref)
306
307        # Volume and Issue
308        if VL_IS_match := re.search(r", (\d+\s?[A-Za-z]*, [\w\s\-\.\–]+)$", ref):
309            VL, IS = VL_IS_match[1].split(", ")
310            ref = ref.rsplit(", ", 2)[0]
311
312        elif IS_match := re.search(r", ([\w-]* ?suppl\.? ?[\w-]*)$", ref, re.I):
313            IS = IS_match[1]
314            ref = ref.rsplit(", ", 1)[0]
315
316        elif IS_match := re.search(r", (\d* ?PART\.? [A-Z\d]+)$", ref, re.I):
317            IS = IS_match[1]
318            ref = ref.rsplit(", ", 1)[0]
319
320        elif IS_match := re.search(r", ([Nn]o\. \d+)$", ref):
321            IS = IS_match[1]
322            ref = ref.rsplit(", ", 1)[0]
323
324        if VL_match := re.search(r", (\d+)$", ref):
325            VL = VL_match[1]
326            ref = ref.rsplit(", ", 1)[0]
327
328        elif VL_match := re.search(r", ([Vv]ol\. [\w\s\.:]+)$", ref):
329            VL = VL_match[1]
330            ref = ref.rsplit(", ", 1)[0]
331
332        # Author
333        full_name_pattern = r"^(?:[a-zA-Z][a-zA-Z\-\.\']*\s)+[A-Z][a-zA-Z\-\.\']*(, |$)"
334        if re.search(r"Et al\.", ref, flags=re.I):
335            First_AU = ref.split(", ")[0]
336            ref = re.sub(r"^.*Et al\.,?\s?", "", ref, flags=re.I)
337
338        elif "., " in ref:
339            AU = ref.rsplit("., ", 1)[0]
340            if "," in AU:
341                First_AU = AU.split(", ")[0]
342            else:
343                First_AU = AU + "."
344            ref = ref.replace(f"{AU}., ", "")
345
346        elif re.search(r"^(?:[A-Z][a-zA-Z]*\s)+[A-Z][a-zA-Z]*(?=, )", ref):
347            First_AU = ref.split(", ", 1)[0]
348            ref = ref.replace(f"{First_AU}, ", "")
349
350        elif re.search(r"^[A-Z-]+, (?=[A-Z])", ref):
351            First_AU = ref.split(", ", 1)[0]
352            ref = ref.replace(f"{First_AU}, ", "")
353
354        elif re.search(full_name_pattern, ref):
355            First_AU = re.split(", ", ref, 1)[0]
356            while re.search(full_name_pattern, ref):
357                ref = re.sub(full_name_pattern, "", ref, 1)
358
359        else:
360            return None
361
362        # Title and Source
363        if ref != "":
364            comma_pattern = r", (?![^\[]*\]|[^(]*\))"
365            comma_count = len(re.findall(comma_pattern, ref))
366            if comma_count == 0:
367                TI = ref
368            elif comma_count == 1:
369                TI, SO = re.split(comma_pattern, ref)
370            else:
371                # conference ref
372                if re.search(
373                    r"[Cc]onference|Conf\.|[Pp]roceeding|Proc\.|[Cc]ommittee|[Ss]ymposium|[Cc]onvention|[Cc]ongress",
374                    ref,
375                ):
376                    TI, SO = ref.split(", ", 1)
377
378                # match source
379                elif SO_match := re.search(r", ([A-Z\d][\w\s\.\-&:]+)$", ref):
380                    SO = SO_match[1]
381                    TI = ref.replace(f", {SO}", "")
382
383                # match title
384                elif TI_match := re.search(
385                    r"^(([^\.\s]+ ){3,}[^\.\sA-Z]+), [A-Z]", ref
386                ):
387                    TI = TI_match[1]
388                    SO = ref.replace(f"{TI}, ", "")
389
390                elif re.search(r"^[A-Z][^A-Z]+$", ref):
391                    TI = ref
392
393                else:
394                    return None
395
396        result = asdict(ScopusField(First_AU, TI, SO, VL, IS, BP, EP, PY))
397        if doc_index is not None:
398            result["doc_index"] = doc_index
399        return result
400
401    def parse_one_ref(
402        self, ref: str, source_type: Literal["wos", "cssci", "scopus"], doc_index=None
403    ):
404        """Parse a raw reference string.
405
406        Args:
407            ref: A raw reference string.
408            source_type: Source type, `wos`, `cssci` or `scopus`.
409            doc_index: doc_index to which the reference belongs. Default is `None`.
410
411        Returns:
412            Parsed reference as a dict.
413
414        Example:
415            >>> from histcite.parse_reference import ParseReference
416            >>> ref = 'Bengio Y, 2001, ADV NEUR IN, V13, P932'
417            >>> parsed_ref = ParseReference().parse_one_ref(ref, 'wos')
418            >>> print(parsed_ref)
419            {'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}
420        """
421        if source_type == "wos":
422            return self._parse_wos_ref(ref, doc_index)
423        elif source_type == "cssci":
424            return self._parse_cssci_ref(ref, doc_index)
425        elif source_type == "scopus":
426            return self._parse_scopus_ref(ref, doc_index)
427        else:
428            raise ValueError("Invalid source type")
429
430    def parse_ref_cell(
431        self,
432        ref_cell: str,
433        source_type: Literal["wos", "cssci", "scopus"],
434        doc_index=None,
435    ) -> Optional[list[dict[str, Optional[str]]]]:
436        """Parse a reference cell.
437
438        Args:
439            ref_cell: A reference cell.
440            source_type: Source type, `wos`, `cssci` or `scopus`.
441            doc_index: doc_index to which the reference cell belongs. Default is `None`.
442
443        Returns:
444            List of parsed references.
445
446        Example:
447            >>> from histcite.parse_reference import ParseReference
448            >>> ref_cell = 'Bengio Y, 2001, ADV NEUR IN, V13, P932; CORTES C, 1995, MACH LEARN, V20, P273, DOI 10.1007/BF00994018'
449            >>> parsed_ref_list = ParseReference().parse_ref_cell(ref_cell, 'wos')
450            >>> print(parsed_ref_list)
451            [{'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}, {'First_AU': 'CORTES C', 'PY': '1995', 'J9': 'MACH LEARN', 'VL': '20', 'BP': '273'}]
452        """
453        sep = "; "
454        try:
455            ref_list = re.split(sep, ref_cell)
456        except:
457            return None
458
459        parsed_ref_list: list[Optional[dict[str, Optional[str]]]]
460        parsed_ref_list = [
461            self.parse_one_ref(ref, source_type, doc_index) for ref in ref_list
462        ]
463        parse_ref_list = [ref for ref in parsed_ref_list if ref is not None]
464        return parse_ref_list
def parse_one_ref( self, ref: str, source_type: Literal['wos', 'cssci', 'scopus'], doc_index=None):
def parse_one_ref(
    self, ref: str, source_type: Literal["wos", "cssci", "scopus"], doc_index=None
):
    """Parse a raw reference string by dispatching to the per-source parser.

    Args:
        ref: A raw reference string.
        source_type: Source type, `wos`, `cssci` or `scopus`.
        doc_index: doc_index to which the reference belongs. Default is `None`.

    Returns:
        Parsed reference as a dict.

    Example:
        >>> from histcite.parse_reference import ParseReference
        >>> ref = 'Bengio Y, 2001, ADV NEUR IN, V13, P932'
        >>> parsed_ref = ParseReference().parse_one_ref(ref, 'wos')
        >>> print(parsed_ref)
        {'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}
    """
    dispatchers = {
        "wos": self._parse_wos_ref,
        "cssci": self._parse_cssci_ref,
        "scopus": self._parse_scopus_ref,
    }
    try:
        parser = dispatchers[source_type]
    except KeyError:
        raise ValueError("Invalid source type") from None
    return parser(ref, doc_index)

Parse a raw reference string.

Arguments:
  • ref: A raw reference string.
  • source_type: Source type, wos, cssci or scopus.
  • doc_index: doc_index to which the reference belongs. Default is None.
Returns:

Parsed reference as a dict.

Example:
>>> from histcite.parse_reference import ParseReference
>>> ref = 'Bengio Y, 2001, ADV NEUR IN, V13, P932'
>>> parsed_ref = ParseReference().parse_one_ref(ref, 'wos')
>>> print(parsed_ref)
{'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}
def parse_ref_cell( self, ref_cell: str, source_type: Literal['wos', 'cssci', 'scopus'], doc_index=None) -> Optional[list[dict[str, Optional[str]]]]:
def parse_ref_cell(
    self,
    ref_cell: str,
    source_type: Literal["wos", "cssci", "scopus"],
    doc_index=None,
) -> Optional[list[dict[str, Optional[str]]]]:
    """Parse a reference cell.

    Args:
        ref_cell: A reference cell.
        source_type: Source type, `wos`, `cssci` or `scopus`.
        doc_index: doc_index to which the reference cell belongs. Default is `None`.

    Returns:
        List of parsed references, or None if `ref_cell` is not a string.

    Example:
        >>> from histcite.parse_reference import ParseReference
        >>> ref_cell = 'Bengio Y, 2001, ADV NEUR IN, V13, P932; CORTES C, 1995, MACH LEARN, V20, P273, DOI 10.1007/BF00994018'
        >>> parsed_ref_list = ParseReference().parse_ref_cell(ref_cell, 'wos')
        >>> print(parsed_ref_list)
        [{'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}, {'First_AU': 'CORTES C', 'PY': '1995', 'J9': 'MACH LEARN', 'VL': '20', 'BP': '273'}]
    """
    sep = "; "
    try:
        ref_list = re.split(sep, ref_cell)
    except TypeError:  # was bare `except:`; non-string cells (e.g. NaN) raise TypeError
        return None

    parsed_ref_list: list[Optional[dict[str, Optional[str]]]] = [
        self.parse_one_ref(ref, source_type, doc_index) for ref in ref_list
    ]
    # Fixed typo: the filtered list was previously bound to a misspelled
    # name (`parse_ref_list`).
    return [ref for ref in parsed_ref_list if ref is not None]

Parse a reference cell.

Arguments:
  • ref_cell: A reference cell.
  • source_type: Source type, wos, cssci or scopus.
  • doc_index: doc_index to which the reference cell belongs. Default is None.
Returns:

List of parsed references.

Example:
>>> from histcite.parse_reference import ParseReference
>>> ref_cell = 'Bengio Y, 2001, ADV NEUR IN, V13, P932; CORTES C, 1995, MACH LEARN, V20, P273, DOI 10.1007/BF00994018'
>>> parsed_ref_list = ParseReference().parse_ref_cell(ref_cell, 'wos')
>>> print(parsed_ref_list)
[{'First_AU': 'Bengio Y', 'PY': '2001', 'J9': 'ADV NEUR IN', 'VL': '13', 'BP': '932'}, {'First_AU': 'CORTES C', 'PY': '1995', 'J9': 'MACH LEARN', 'VL': '20', 'BP': '273'}]
class ProcessFile:
  8class ProcessFile:
  9    """Process docs file, extract references and citation relationship.
 10
 11    Attributes:
 12        docs_df: DataFrame of docs.
 13        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
 14    """
 15
 16    def __init__(
 17        self, docs_df: pd.DataFrame, source_type: Literal["wos", "cssci", "scopus"]
 18    ):
 19        """
 20        Args:
 21            docs_df: DataFrame of docs.
 22            source_type: Source type of docs, `wos`, `cssci` or `scopus`.
 23        """
 24        self.docs_df: pd.DataFrame = docs_df.copy()
 25        self.source_type: Literal["wos", "cssci", "scopus"] = source_type
 26
 27    @staticmethod
 28    def _concat_refs(
 29        cr_field_series: pd.Series,
 30        source_type: Literal["wos", "cssci", "scopus"],
 31    ) -> pd.DataFrame:
 32        """Concat all parsed references and return dataframe.
 33
 34        Args:
 35            cr_field_series: The CR field of docs_df.
 36            source_type: Source type of docs, 'wos', 'cssci' or 'scopus'.
 37
 38        Returns:
 39            DataFrame of references.
 40        """
 41        total_ref_list: list[dict[str, Optional[str]]] = []
 42        for idx, cell in cr_field_series.items():
 43            parse_result = ParseReference().parse_ref_cell(cell, source_type, idx)
 44            if parse_result is not None:
 45                total_ref_list.extend(parse_result)
 46        refs_df = pd.DataFrame(total_ref_list)
 47        return refs_df
 48
 49    def extract_reference(self) -> pd.DataFrame:
 50        """Extract total references and return reference dataframe."""
 51        cr_field_series = self.docs_df["CR"]
 52        if self.source_type == "wos":
 53            refs_df = self._concat_refs(cr_field_series, "wos")
 54        elif self.source_type == "cssci":
 55            refs_df = self._concat_refs(cr_field_series, "cssci")
 56        elif self.source_type == "scopus":
 57            refs_df = self._concat_refs(cr_field_series, "scopus")
 58        else:
 59            raise ValueError("Invalid source type")
 60
 61        # Maybe duplicate reference in some docs' references
 62        refs_df.drop_duplicates(ignore_index=True, inplace=True)
 63        refs_df.insert(0, "ref_index", refs_df.index)
 64        return refs_df
 65
 66    @staticmethod
 67    def _reference2citation(cited_doc_index_series: pd.Series) -> pd.Series:
 68        citing_doc_index_series = pd.Series(
 69            [[] for i in range(len(cited_doc_index_series))]
 70        )
 71        for doc_index, ref_list in cited_doc_index_series.items():
 72            if len(ref_list) > 0:
 73                for ref_index in ref_list:
 74                    citing_doc_index_series[ref_index].append(doc_index)
 75        return citing_doc_index_series
 76
 77    def process_citation(self, refs_df: pd.DataFrame) -> pd.DataFrame:
 78        """Return citation_relationship dataframe."""
 79        if self.source_type == "wos":
 80            self.docs_df["DI"] = self.docs_df["DI"].str.lower()
 81            refs_df = refs_df.astype({"PY": "int64[pyarrow]"})
 82            (
 83                cited_doc_index_series,
 84                local_refs_series,
 85            ) = RecognizeReference.recognize_wos_reference(self.docs_df, refs_df)
 86
 87        elif self.source_type == "cssci":
 88            self.docs_df["TI"] = self.docs_df["TI"].str.lower()
 89            refs_df["TI"] = refs_df["TI"].str.lower()
 90            (
 91                cited_doc_index_series,
 92                local_refs_series,
 93            ) = RecognizeReference.recognize_cssci_reference(self.docs_df, refs_df)
 94
 95        elif self.source_type == "scopus":
 96            self.docs_df["TI"] = self.docs_df["TI"].str.lower()
 97            refs_df["TI"] = refs_df["TI"].str.lower()
 98            (
 99                cited_doc_index_series,
100                local_refs_series,
101            ) = RecognizeReference.recognize_scopus_reference(self.docs_df, refs_df)
102        else:
103            raise ValueError("Invalid source type")
104
105        cited_doc_index_series = cited_doc_index_series.reindex(
106            self.docs_df["doc_index"]
107        )
108        cited_doc_index_series = cited_doc_index_series.apply(
109            lambda x: x if isinstance(x, list) else []
110        )
111        citing_doc_index_series = self._reference2citation(cited_doc_index_series)
112        lcr_field = cited_doc_index_series.apply(len)
113        lcs_field = citing_doc_index_series.apply(len)
114        citation_relationship = pd.DataFrame({"doc_index": self.docs_df.doc_index})
115        citation_relationship["cited_doc_index"] = [
116            ";".join([str(j) for j in i]) if i else None for i in cited_doc_index_series
117        ]
118        citation_relationship["citing_doc_index"] = [
119            ";".join([str(j) for j in i]) if i else None
120            for i in citing_doc_index_series
121        ]
122        citation_relationship["LCR"] = lcr_field
123        citation_relationship["LCS"] = lcs_field
124        return citation_relationship

Process a docs file, extracting references and the citation relationships between docs.

Attributes:
  • docs_df: DataFrame of docs.
  • source_type: Source type of docs, wos, cssci or scopus.
ProcessFile( docs_df: pandas.core.frame.DataFrame, source_type: Literal['wos', 'cssci', 'scopus'])
def __init__(
    self, docs_df: pd.DataFrame, source_type: Literal["wos", "cssci", "scopus"]
):
    """
    Args:
        docs_df: DataFrame of docs.
        source_type: Source type of docs, `wos`, `cssci` or `scopus`.
    """
    self.source_type: Literal["wos", "cssci", "scopus"] = source_type
    # Keep a private copy so downstream mutation never touches the caller's frame.
    self.docs_df: pd.DataFrame = docs_df.copy()
Arguments:
  • docs_df: DataFrame of docs.
  • source_type: Source type of docs, wos, cssci or scopus.
docs_df: pandas.core.frame.DataFrame
source_type: Literal['wos', 'cssci', 'scopus']
def extract_reference(self) -> pandas.core.frame.DataFrame:
def extract_reference(self) -> pd.DataFrame:
    """Extract total references and return reference dataframe.

    Raises:
        ValueError: If `source_type` is not `wos`, `cssci` or `scopus`.
    """
    if self.source_type not in ("wos", "cssci", "scopus"):
        raise ValueError("Invalid source type")
    cr_field_series = self.docs_df["CR"]
    # The former per-source if/elif chain only forwarded source_type verbatim;
    # a single validated call is equivalent.
    refs_df = self._concat_refs(cr_field_series, self.source_type)

    # Maybe duplicate reference in some docs' references
    refs_df.drop_duplicates(ignore_index=True, inplace=True)
    refs_df.insert(0, "ref_index", refs_df.index)
    return refs_df

Extract all references and return a reference dataframe.

def process_citation( self, refs_df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame:
 77    def process_citation(self, refs_df: pd.DataFrame) -> pd.DataFrame:
 78        """Return citation_relationship dataframe."""
 79        if self.source_type == "wos":
 80            self.docs_df["DI"] = self.docs_df["DI"].str.lower()
 81            refs_df = refs_df.astype({"PY": "int64[pyarrow]"})
 82            (
 83                cited_doc_index_series,
 84                local_refs_series,
 85            ) = RecognizeReference.recognize_wos_reference(self.docs_df, refs_df)
 86
 87        elif self.source_type == "cssci":
 88            self.docs_df["TI"] = self.docs_df["TI"].str.lower()
 89            refs_df["TI"] = refs_df["TI"].str.lower()
 90            (
 91                cited_doc_index_series,
 92                local_refs_series,
 93            ) = RecognizeReference.recognize_cssci_reference(self.docs_df, refs_df)
 94
 95        elif self.source_type == "scopus":
 96            self.docs_df["TI"] = self.docs_df["TI"].str.lower()
 97            refs_df["TI"] = refs_df["TI"].str.lower()
 98            (
 99                cited_doc_index_series,
100                local_refs_series,
101            ) = RecognizeReference.recognize_scopus_reference(self.docs_df, refs_df)
102        else:
103            raise ValueError("Invalid source type")
104
105        cited_doc_index_series = cited_doc_index_series.reindex(
106            self.docs_df["doc_index"]
107        )
108        cited_doc_index_series = cited_doc_index_series.apply(
109            lambda x: x if isinstance(x, list) else []
110        )
111        citing_doc_index_series = self._reference2citation(cited_doc_index_series)
112        lcr_field = cited_doc_index_series.apply(len)
113        lcs_field = citing_doc_index_series.apply(len)
114        citation_relationship = pd.DataFrame({"doc_index": self.docs_df.doc_index})
115        citation_relationship["cited_doc_index"] = [
116            ";".join([str(j) for j in i]) if i else None for i in cited_doc_index_series
117        ]
118        citation_relationship["citing_doc_index"] = [
119            ";".join([str(j) for j in i]) if i else None
120            for i in citing_doc_index_series
121        ]
122        citation_relationship["LCR"] = lcr_field
123        citation_relationship["LCS"] = lcs_field
124        return citation_relationship

Return citation_relationship dataframe.

class ReadFile:
class ReadFile:
    """Read files in a folder and return a dataframe.

    Attributes:
        folder_path: Path of a folder.
        source_type: Type of source, `wos`, `cssci` or `scopus`.
        file_path_list: List of valid file paths; files that don't comply
            with the source's naming rules are ignored.
    """

    # File-name prefix that identifies an export file of each source.
    _FILE_PREFIXES = {"wos": "savedrecs", "cssci": "LY_", "scopus": "scopus"}
    # Columns used to detect duplicate records for each source.
    _DEDUP_COLS = {"wos": ["UT"], "cssci": ["TI", "First_AU"], "scopus": ["EID"]}

    def __init__(
        self, folder_path: str, source_type: Literal["wos", "cssci", "scopus"]
    ):
        """
        Args:
            folder_path: Path of a folder.
            source_type: Type of source, `wos`, `cssci` or `scopus`.
        """
        self.folder_path: str = folder_path
        self.source_type: Literal["wos", "cssci", "scopus"] = source_type
        self.file_path_list: list[str] = self._obtain_file_path_list()

    def _obtain_file_path_list(self) -> list[str]:
        """Return sorted paths of files matching the source's name prefix."""
        if self.source_type not in self._FILE_PREFIXES:
            raise ValueError("Invalid source type")
        prefix = self._FILE_PREFIXES[self.source_type]
        file_name_list = sorted(
            name for name in os.listdir(self.folder_path) if name.startswith(prefix)
        )
        return [
            os.path.join(self.folder_path, file_name) for file_name in file_name_list
        ]

    # Backward-compatible alias for the old (misspelled) method name.
    _obrain_file_path_list = _obtain_file_path_list

    def _concat_df(self, read_file_func: Callable[[str], pd.DataFrame]) -> pd.DataFrame:
        """Apply `read_file_func` to every file and concatenate the results."""
        if not self.file_path_list:
            raise FileNotFoundError("No valid file in the folder")
        if len(self.file_path_list) == 1:
            return read_file_func(self.file_path_list[0])
        return pd.concat(
            [read_file_func(file_path) for file_path in self.file_path_list],
            ignore_index=True,
            copy=False,
        )

    def read_all(self) -> pd.DataFrame:
        """Concat multi dataframe and drop duplicate rows.

        if wos, drop duplicate rows by `UT`.

        if cssci, drop duplicate rows by `TI` and `First_AU`.

        if scopus, drop duplicate rows by `EID`.
        """
        readers = {
            "wos": ReadWosFile.read_wos_file,
            "cssci": ReadCssciFile.read_cssci_file,
            "scopus": ReadScopusFile.read_scopus_file,
        }
        if self.source_type not in readers:
            raise ValueError("Invalid source type")
        docs_df = self._concat_df(readers[self.source_type])

        # Drop duplicate rows on the source-specific key columns.
        original_num = docs_df.shape[0]
        docs_df.drop_duplicates(
            subset=self._DEDUP_COLS[self.source_type], ignore_index=True, inplace=True
        )
        current_num = docs_df.shape[0]
        print(f"共读取 {original_num} 条数据,去重后剩余 {current_num} 条")
        docs_df.insert(0, "doc_index", docs_df.index)
        return docs_df

Read files in a folder and return a dataframe.

Attributes:
  • folder_path: Path of a folder.
  • source_type: Type of source, wos, cssci or scopus.
  • file_path_list: List of valid file paths; files that don't comply with the naming rules are ignored.
ReadFile(folder_path: str, source_type: Literal['wos', 'cssci', 'scopus'])
191    def __init__(
192        self, folder_path: str, source_type: Literal["wos", "cssci", "scopus"]
193    ):
194        """
195        Args:
196            folder_path: Path of a folder.
197            source_type: Type of source, `wos`, `cssci` and `scopus`.
198        """
199        self.folder_path: str = folder_path
200        self.source_type: Literal["wos", "cssci", "scopus"] = source_type
201        self.file_path_list: list[str] = self._obrain_file_path_list()
Arguments:
  • folder_path: Path of a folder.
  • source_type: Type of source, wos, cssci and scopus.
folder_path: str
source_type: Literal['wos', 'cssci', 'scopus']
file_path_list: list[str]
def read_all(self) -> pandas.core.frame.DataFrame:
234    def read_all(self) -> pd.DataFrame:
235        """Concat multi dataframe and drop duplicate rows.
236
237        if wos, drop duplicate rows by `UT`.
238
239        if cssci, drop duplicate rows by `TI` and `First_AU`.
240
241        if scopus, drop duplicate rows by `EID`.
242        """
243        if self.source_type == "wos":
244            docs_df = self._concat_df(ReadWosFile.read_wos_file)
245        elif self.source_type == "cssci":
246            docs_df = self._concat_df(ReadCssciFile.read_cssci_file)
247        elif self.source_type == "scopus":
248            docs_df = self._concat_df(ReadScopusFile.read_scopus_file)
249        else:
250            raise ValueError("Invalid source type")
251
252        # Drop duplicate rows
253        original_num = docs_df.shape[0]
254        if self.source_type == "wos":
255            check_cols = ["UT"]
256        elif self.source_type == "cssci":
257            check_cols = ["TI", "First_AU"]
258        elif self.source_type == "scopus":
259            check_cols = ["EID"]
260        else:
261            raise ValueError("Invalid source type")
262        docs_df.drop_duplicates(subset=check_cols, ignore_index=True, inplace=True)
263        current_num = docs_df.shape[0]
264        print(f"共读取 {original_num} 条数据,去重后剩余 {current_num} 条")
265        docs_df.insert(0, "doc_index", docs_df.index)
266        return docs_df

Concat multi dataframe and drop duplicate rows.

if wos, drop duplicate rows by UT.

if cssci, drop duplicate rows by TI and First_AU.

if scopus, drop duplicate rows by EID.

class ReadWosFile:
class ReadWosFile:
    """Reader for Web of Science export files."""

    @staticmethod
    def _extract_first_author(au_field: pd.Series) -> pd.Series:
        # AU is ";"-separated; take the first author and drop the comma.
        first_author = au_field.str.split(pat=";", n=1, expand=True)[0]
        return first_author.str.replace(",", "")

    @staticmethod
    def read_wos_file(file_path: str) -> pd.DataFrame:
        """Read Web of Science file and return dataframe.

        Args:
            file_path: Path of a Web of Science file. File name is similar to `savedrecs.txt`.
        """
        # Tab-separated WOS export; load only the fields used downstream.
        use_cols = [
            "AU", "TI", "SO", "DT", "CR", "DE", "C3", "NR",
            "TC", "J9", "PY", "VL", "BP", "DI", "UT",
        ]
        df = read_csv_file(file_path, use_cols, "\t")
        df.insert(1, "First_AU", ReadWosFile._extract_first_author(df["AU"]))
        df["source file"] = os.path.basename(file_path)
        return df
@staticmethod
def read_wos_file(file_path: str) -> pandas.core.frame.DataFrame:
37    @staticmethod
38    def read_wos_file(file_path: str) -> pd.DataFrame:
39        """Read Web of Science file and return dataframe.
40
41        Args:
42            file_path: Path of a Web of Science file. File name is similar to `savedrecs.txt`.
43        """
44        use_cols = [
45            "AU",
46            "TI",
47            "SO",
48            "DT",
49            "CR",
50            "DE",
51            "C3",
52            "NR",
53            "TC",
54            "J9",
55            "PY",
56            "VL",
57            "BP",
58            "DI",
59            "UT",
60        ]
61        df = read_csv_file(file_path, use_cols, "\t")
62        df.insert(1, "First_AU", ReadWosFile._extract_first_author(df["AU"]))
63        df["source file"] = os.path.basename(file_path)
64        return df

Read Web of Science file and return dataframe.

Arguments:
  • file_path: Path of a Web of Science file. File name is similar to savedrecs.txt.
class ReadCssciFile:
 67class ReadCssciFile:
 68    @staticmethod
 69    def _extract_org(org_cell: str) -> str:
 70        org_set = set(re.findall(r"](.*?)(?:/|$)", org_cell))
 71        org_list = [i.replace(".", "") for i in org_set]
 72        return "; ".join(org_list)
 73
 74    @staticmethod
 75    def read_cssci_file(file_path: str) -> pd.DataFrame:
 76        """Read CSSCI file and return dataframe. Use `WOS` fields to replace original fields.
 77
 78        Args:
 79            file_path: Path of a CSSCI file. File name is similar to `LY_.txt`.
 80        """
 81        with open(file_path, "r") as f:
 82            text = f.read()
 83
 84        body_text = text.split("\n\n\n", 1)[1]
 85        contents = {}
 86        original_fields = [
 87            "来源篇名",
 88            "来源作者",
 89            "基    金",
 90            "期    刊",
 91            "机构名称",
 92            "第一作者",
 93            "年代卷期",
 94            "关 键 词",
 95            "参考文献",
 96        ]
 97        for field in original_fields:
 98            if field != "参考文献":
 99                field_pattern = f"【{field}】(.*?)\n"
100                contents[field] = re.findall(field_pattern, body_text)
101            else:
102                field_pattern = "【参考文献】\n(.*?)\n?" + "-" * 5
103                contents[field] = re.findall(field_pattern, body_text, flags=re.S)
104
105        df = pd.DataFrame.from_dict(contents)
106        # Rename columns
107        column_mapping = {
108            "来源篇名": "TI",
109            "来源作者": "AU",
110            "基    金": "FU",
111            "期    刊": "SO",
112            "机构名称": "C3",
113            "第一作者": "First_AU",
114            "年代卷期": "PY&VL&BP&EP",
115            "关 键 词": "DE",
116            "参考文献": "CR",
117        }
118        df.rename(columns=column_mapping, inplace=True)
119
120        df["AU"] = df["AU"].str.replace("/", "; ")
121        df["DE"] = df["DE"].str.replace("/", "; ")
122        df["PY"] = df["PY&VL&BP&EP"].str.extract(r"^(\d{4}),", expand=False)
123        df["C3"] = df["C3"].apply(ReadCssciFile._extract_org)
124        df["CR"] = df["CR"].str.replace("\n", "; ")
125        df["NR"] = df["CR"].str.count("; ")
126        df.insert(2, "First_AU", df.pop("First_AU"))
127        df["source file"] = os.path.basename(file_path)
128        return df
@staticmethod
def read_cssci_file(file_path: str) -> pandas.core.frame.DataFrame:
 74    @staticmethod
 75    def read_cssci_file(file_path: str) -> pd.DataFrame:
 76        """Read CSSCI file and return dataframe. Use `WOS` fields to replace original fields.
 77
 78        Args:
 79            file_path: Path of a CSSCI file. File name is similar to `LY_.txt`.
 80        """
 81        with open(file_path, "r") as f:
 82            text = f.read()
 83
 84        body_text = text.split("\n\n\n", 1)[1]
 85        contents = {}
 86        original_fields = [
 87            "来源篇名",
 88            "来源作者",
 89            "基    金",
 90            "期    刊",
 91            "机构名称",
 92            "第一作者",
 93            "年代卷期",
 94            "关 键 词",
 95            "参考文献",
 96        ]
 97        for field in original_fields:
 98            if field != "参考文献":
 99                field_pattern = f"【{field}】(.*?)\n"
100                contents[field] = re.findall(field_pattern, body_text)
101            else:
102                field_pattern = "【参考文献】\n(.*?)\n?" + "-" * 5
103                contents[field] = re.findall(field_pattern, body_text, flags=re.S)
104
105        df = pd.DataFrame.from_dict(contents)
106        # Rename columns
107        column_mapping = {
108            "来源篇名": "TI",
109            "来源作者": "AU",
110            "基    金": "FU",
111            "期    刊": "SO",
112            "机构名称": "C3",
113            "第一作者": "First_AU",
114            "年代卷期": "PY&VL&BP&EP",
115            "关 键 词": "DE",
116            "参考文献": "CR",
117        }
118        df.rename(columns=column_mapping, inplace=True)
119
120        df["AU"] = df["AU"].str.replace("/", "; ")
121        df["DE"] = df["DE"].str.replace("/", "; ")
122        df["PY"] = df["PY&VL&BP&EP"].str.extract(r"^(\d{4}),", expand=False)
123        df["C3"] = df["C3"].apply(ReadCssciFile._extract_org)
124        df["CR"] = df["CR"].str.replace("\n", "; ")
125        df["NR"] = df["CR"].str.count("; ")
126        df.insert(2, "First_AU", df.pop("First_AU"))
127        df["source file"] = os.path.basename(file_path)
128        return df

Read CSSCI file and return dataframe. Use WOS fields to replace original fields.

Arguments:
  • file_path: Path of a CSSCI file. File name is similar to LY_.txt.
class ReadScopusFile:
class ReadScopusFile:
    """Reader for Scopus export files."""

    @staticmethod
    def read_scopus_file(file_path: str) -> pd.DataFrame:
        """Read Scopus file return dataframe. Use `WOS` fields to replace original fields.

        Args:
            file_path: Path of a Scopus file. File name is similar to `scopus.csv`.
        """
        # Scopus CSV columns loaded from the export file (order preserved).
        use_cols = [
            "Authors", "Author full names", "Title", "Year", "Source title",
            "Volume", "Issue", "Page start", "Page end", "Cited by",
            "DOI", "Author Keywords", "References", "Document Type", "EID",
        ]

        df = read_csv_file(file_path, use_cols)
        # Scopus column -> WOS-style short column name.
        column_mapping = {
            "Authors": "AU",
            "Title": "TI",
            "Year": "PY",
            "Source title": "SO",
            "Volume": "VL",
            "Issue": "IS",
            "Page start": "BP",
            "Page end": "EP",
            "Cited by": "TC",
            "DOI": "DI",
            "Author Keywords": "DE",
            "References": "CR",
            "Document Type": "DT",
        }
        df.rename(columns=column_mapping, inplace=True)

        # NOTE(review): counts "; " separators, i.e. (#refs - 1) for a
        # non-empty CR — confirm whether NR is meant to be the full count.
        df["NR"] = df["CR"].str.count("; ")
        df.insert(1, "First_AU", df["AU"].str.split(pat=";", n=1, expand=True)[0])
        df["source file"] = os.path.basename(file_path)
        return df
@staticmethod
def read_scopus_file(file_path: str) -> pandas.core.frame.DataFrame:
132    @staticmethod
133    def read_scopus_file(file_path: str) -> pd.DataFrame:
134        """Read Scopus file return dataframe. Use `WOS` fields to replace original fields.
135
136        Args:
137            file_path: Path of a Scopus file. File name is similar to `scopus.csv`.
138        """
139        use_cols = [
140            "Authors",
141            "Author full names",
142            "Title",
143            "Year",
144            "Source title",
145            "Volume",
146            "Issue",
147            "Page start",
148            "Page end",
149            "Cited by",
150            "DOI",
151            "Author Keywords",
152            "References",
153            "Document Type",
154            "EID",
155        ]
156
157        df = read_csv_file(file_path, use_cols)
158        # Rename columns
159        column_mapping = {
160            "Authors": "AU",
161            "Title": "TI",
162            "Year": "PY",
163            "Source title": "SO",
164            "Volume": "VL",
165            "Issue": "IS",
166            "Page start": "BP",
167            "Page end": "EP",
168            "Cited by": "TC",
169            "DOI": "DI",
170            "Author Keywords": "DE",
171            "References": "CR",
172            "Document Type": "DT",
173        }
174        df.rename(columns=column_mapping, inplace=True)
175
176        df["NR"] = df["CR"].str.count("; ")
177        df.insert(1, "First_AU", df["AU"].str.split(pat=";", n=1, expand=True)[0])
178        df["source file"] = os.path.basename(file_path)
179        return df

Read Scopus file and return dataframe. Use WOS fields to replace original fields.

Arguments:
  • file_path: Path of a Scopus file. File name is similar to scopus.csv.
class RecognizeReference:
  7class RecognizeReference:
  8    @staticmethod
  9    def recognize_refs_factory(
 10        docs_df: pd.DataFrame,
 11        refs_df: pd.DataFrame,
 12        compare_cols: list[str],
 13        drop_duplicates: bool = False,
 14    ):
 15        """
 16        Recognize local references of a doc.
 17
 18        Args:
 19            docs_df: DataFrame of docs.
 20            refs_df: DataFrame of references.
 21            compare_cols: Columns to compare. e.g. `["First_AU", "TI"]`.
 22            drop_duplicates: Whether to drop duplicated rows with same values in `compare_cols`. Default is False.
 23
 24        Returns:
 25            Tuple of two Series, cited_refs_series and local_refs_series.
 26
 27            cited_refs_series: A Series of lists, each list contains the indexes of local references.
 28            local_refs_series: A Series of indexes of local references.
 29        """
 30        # Drop rows with missing values
 31        docs_df = docs_df.dropna(subset=compare_cols)
 32        refs_df = refs_df.dropna(subset=compare_cols)
 33
 34        if drop_duplicates is True:
 35            docs_df = docs_df.drop_duplicates(subset=compare_cols)
 36
 37        docs_df = docs_df[["doc_index"] + compare_cols]
 38        refs_df = refs_df[["doc_index", "ref_index"] + compare_cols]
 39        shared_df = pd.merge(
 40            refs_df, docs_df, how="left", on=compare_cols, suffixes=("_x", "_y")
 41        ).dropna(subset="doc_index_y")
 42        shared_df = shared_df.astype({"doc_index_y": "int64"})
 43        cited_refs_series = shared_df.groupby("doc_index_x")["doc_index_y"].apply(list)
 44        cited_refs_series = cited_refs_series.apply(lambda x: sorted(x))
 45        local_refs_series = shared_df["ref_index"].reset_index(drop=True)
 46        return cited_refs_series, local_refs_series
 47
 48    @staticmethod
 49    def recognize_wos_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame):
 50        """Recognize local references of a doc from Web of Science.
 51
 52        If `DOI` exists, use `DOI` to recognize references.
 53        Otherwise, use `First_AU`, `PY`, `J9`, `BP` to recognize references.
 54
 55        Args:
 56            docs_df: DataFrame of docs.
 57            refs_df: DataFrame of references.
 58
 59        Returns:
 60            Tuple of two Series, cited_refs_series and local_refs_series.
 61        """
 62
 63        def _merge_lists(list1: Optional[list[int]], list2: Optional[list[int]]):
 64            if isinstance(list1, list) and isinstance(list2, list):
 65                return list1 + list2
 66            else:
 67                if isinstance(list1, list):
 68                    return list1
 69                else:
 70                    return list2
 71
 72        # DOI exists
 73        compare_cols_doi = ["DI"]
 74        result_doi = RecognizeReference.recognize_refs_factory(
 75            docs_df, refs_df, compare_cols_doi
 76        )
 77
 78        # DOI not exists
 79        compare_cols = ["First_AU", "PY", "J9", "BP"]
 80        result = RecognizeReference.recognize_refs_factory(
 81            docs_df[docs_df["DI"].isna()], refs_df[refs_df["DI"].isna()], compare_cols
 82        )
 83        cited_refs_series = result_doi[0].combine(result[0], _merge_lists)
 84        local_refs_series = pd.concat([result_doi[1], result[1]])
 85        return cited_refs_series, local_refs_series
 86
 87    @staticmethod
 88    def recognize_cssci_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame):
 89        """Recognize local references of a doc from CSSCI.
 90
 91        Use `First_AU`, `TI` to recognize references.
 92
 93        Args:
 94            docs_df: DataFrame of docs.
 95            refs_df: DataFrame of references.
 96
 97        Returns:
 98            Tuple of two Series, cited_refs_series and local_refs_series.
 99        """
100        compare_cols = ["First_AU", "TI"]
101        return RecognizeReference.recognize_refs_factory(docs_df, refs_df, compare_cols)
102
103    @staticmethod
104    def recognize_scopus_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame):
105        """Recognize local references of a doc from Scopus.
106
107        Use `First_AU`, `TI` to recognize references.
108
109        Args:
110            docs_df: DataFrame of docs.
111            refs_df: DataFrame of references.
112
113        Returns:
114            Tuple of two Series, cited_refs_series and local_refs_series.
115        """
116        compare_cols = ["First_AU", "TI"]
117        return RecognizeReference.recognize_refs_factory(
118            docs_df, refs_df, compare_cols, drop_duplicates=True
119        )
@staticmethod
def recognize_refs_factory( docs_df: pandas.core.frame.DataFrame, refs_df: pandas.core.frame.DataFrame, compare_cols: list[str], drop_duplicates: bool = False):
 8    @staticmethod
 9    def recognize_refs_factory(
10        docs_df: pd.DataFrame,
11        refs_df: pd.DataFrame,
12        compare_cols: list[str],
13        drop_duplicates: bool = False,
14    ):
15        """
16        Recognize local references of a doc.
17
18        Args:
19            docs_df: DataFrame of docs.
20            refs_df: DataFrame of references.
21            compare_cols: Columns to compare. e.g. `["First_AU", "TI"]`.
22            drop_duplicates: Whether to drop duplicated rows with same values in `compare_cols`. Default is False.
23
24        Returns:
25            Tuple of two Series, cited_refs_series and local_refs_series.
26
27            cited_refs_series: A Series of lists, each list contains the indexes of local references.
28            local_refs_series: A Series of indexes of local references.
29        """
30        # Drop rows with missing values
31        docs_df = docs_df.dropna(subset=compare_cols)
32        refs_df = refs_df.dropna(subset=compare_cols)
33
34        if drop_duplicates is True:
35            docs_df = docs_df.drop_duplicates(subset=compare_cols)
36
37        docs_df = docs_df[["doc_index"] + compare_cols]
38        refs_df = refs_df[["doc_index", "ref_index"] + compare_cols]
39        shared_df = pd.merge(
40            refs_df, docs_df, how="left", on=compare_cols, suffixes=("_x", "_y")
41        ).dropna(subset="doc_index_y")
42        shared_df = shared_df.astype({"doc_index_y": "int64"})
43        cited_refs_series = shared_df.groupby("doc_index_x")["doc_index_y"].apply(list)
44        cited_refs_series = cited_refs_series.apply(lambda x: sorted(x))
45        local_refs_series = shared_df["ref_index"].reset_index(drop=True)
46        return cited_refs_series, local_refs_series

Recognize local references of a doc.

Arguments:
  • docs_df: DataFrame of docs.
  • refs_df: DataFrame of references.
  • compare_cols: Columns to compare. e.g. ["First_AU", "TI"].
  • drop_duplicates: Whether to drop duplicated rows with same values in compare_cols. Default is False.
Returns:

Tuple of two Series, cited_refs_series and local_refs_series.

cited_refs_series: A Series of lists, each list contains the indexes of local references. local_refs_series: A Series of indexes of local references.

@staticmethod
def recognize_wos_reference( docs_df: pandas.core.frame.DataFrame, refs_df: pandas.core.frame.DataFrame):
48    @staticmethod
49    def recognize_wos_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame):
50        """Recognize local references of a doc from Web of Science.
51
52        If `DOI` exists, use `DOI` to recognize references.
53        Otherwise, use `First_AU`, `PY`, `J9`, `BP` to recognize references.
54
55        Args:
56            docs_df: DataFrame of docs.
57            refs_df: DataFrame of references.
58
59        Returns:
60            Tuple of two Series, cited_refs_series and local_refs_series.
61        """
62
63        def _merge_lists(list1: Optional[list[int]], list2: Optional[list[int]]):
64            if isinstance(list1, list) and isinstance(list2, list):
65                return list1 + list2
66            else:
67                if isinstance(list1, list):
68                    return list1
69                else:
70                    return list2
71
72        # DOI exists
73        compare_cols_doi = ["DI"]
74        result_doi = RecognizeReference.recognize_refs_factory(
75            docs_df, refs_df, compare_cols_doi
76        )
77
78        # DOI not exists
79        compare_cols = ["First_AU", "PY", "J9", "BP"]
80        result = RecognizeReference.recognize_refs_factory(
81            docs_df[docs_df["DI"].isna()], refs_df[refs_df["DI"].isna()], compare_cols
82        )
83        cited_refs_series = result_doi[0].combine(result[0], _merge_lists)
84        local_refs_series = pd.concat([result_doi[1], result[1]])
85        return cited_refs_series, local_refs_series

Recognize local references of a doc from Web of Science.

If DOI exists, use DOI to recognize references. Otherwise, use First_AU, PY, J9, BP to recognize references.

Arguments:
  • docs_df: DataFrame of docs.
  • refs_df: DataFrame of references.
Returns:

Tuple of two Series, cited_refs_series and local_refs_series.

@staticmethod
def recognize_cssci_reference( docs_df: pandas.core.frame.DataFrame, refs_df: pandas.core.frame.DataFrame):
 87    @staticmethod
 88    def recognize_cssci_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame):
 89        """Recognize local references of a doc from CSSCI.
 90
 91        Use `First_AU`, `TI` to recognize references.
 92
 93        Args:
 94            docs_df: DataFrame of docs.
 95            refs_df: DataFrame of references.
 96
 97        Returns:
 98            Tuple of two Series, cited_refs_series and local_refs_series.
 99        """
100        compare_cols = ["First_AU", "TI"]
101        return RecognizeReference.recognize_refs_factory(docs_df, refs_df, compare_cols)

Recognize local references of a doc from CSSCI.

Use First_AU, TI to recognize references.

Arguments:
  • docs_df: DataFrame of docs.
  • refs_df: DataFrame of references.
Returns:

Tuple of two Series, cited_refs_series and local_refs_series.

@staticmethod
def recognize_scopus_reference( docs_df: pandas.core.frame.DataFrame, refs_df: pandas.core.frame.DataFrame):
103    @staticmethod
104    def recognize_scopus_reference(docs_df: pd.DataFrame, refs_df: pd.DataFrame):
105        """Recognize local references of a doc from Scopus.
106
107        Use `First_AU`, `TI` to recognize references.
108
109        Args:
110            docs_df: DataFrame of docs.
111            refs_df: DataFrame of references.
112
113        Returns:
114            Tuple of two Series, cited_refs_series and local_refs_series.
115        """
116        compare_cols = ["First_AU", "TI"]
117        return RecognizeReference.recognize_refs_factory(
118            docs_df, refs_df, compare_cols, drop_duplicates=True
119        )

Recognize local references of a doc from Scopus.

Use First_AU, TI to recognize references.

Arguments:
  • docs_df: DataFrame of docs.
  • refs_df: DataFrame of references.
Returns:

Tuple of two Series, cited_refs_series and local_refs_series.