现在的位置: 首页 > 综合 > 正文

读取英文单词,按顺序排列

2013年04月17日 ⁄ 综合 ⁄ 共 9483字 ⁄ 字号 评论关闭

1.     Implement a function that counting word frequency. It reads an English article from an user-specified txt file(article.txt) and counts their number. Those exclusive words should not be counted. Output the words and counts to 2 files. One(3_1_1out.txt) is in lexicographic order, and the other (3_1_2out.txt) is in descending frequency order.

#pragma warning(disable:4786)

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

 

using namespace std;

 

typedef map<string, int>::value_type sival_type;

 

// Reads the text file named by file_name and returns its lines, in file
// order, as a heap-allocated vector (one element per line, without the
// line terminator). The caller owns the returned vector and must delete it.
//
// If the file cannot be opened, an error is written to stderr and the
// program terminates with exit status 1.
vector<string> *retrieve_text(string file_name)
{
       ifstream article_file( file_name.c_str(), ios::in );
       if (!article_file) {
              // Diagnostics go to stderr, in the same wording the output
              // routines of this program use.
              cerr << "error: unable to open input file: "
                     << file_name << endl;
              exit (1);
       }

       vector<string> *lines_of_text = new vector<string>;
       string textline;

       while ( getline(article_file, textline) )
       {
              lines_of_text->push_back(textline);
       }
       return lines_of_text;
}

 

// Converts every capital letter 'A'-'Z' found in the lines of *text_file to
// its lowercase form, in place. All other characters are left untouched.
void strip_caps( vector<string> *text_file )
{
       const string upper_case( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" );

       for ( vector<string>::size_type row = 0; row < text_file->size(); ++row )
       {
              string &line = (*text_file)[row];

              for ( string::size_type col = 0; col < line.size(); ++col )
              {
                     // Only characters listed in upper_case are rewritten.
                     if ( upper_case.find( line[col] ) != string::npos )
                     {
                            line[col] = tolower( line[col] );
                     }
              }
       }
}

 

// Splits every line of *text_file into words and returns them, in order of
// appearance, as a heap-allocated vector that the caller must delete.
//
// A word is a maximal run of lowercase letters 'a'-'z'. Every other
// character (digits, punctuation, whitespace, capital letters) acts as a
// separator, so the text is expected to have been lower-cased beforehand
// (main() calls strip_caps() first). Lines without any letter contribute
// no words.
vector<string> *separate_words( const vector<string> *text_file )
{
       // The full lowercase alphabet; a word ends at the first character
       // that is not in this set.
       const string letters("abcdefghijklmnopqrstuvwxyz");

       // Collects the individual words of all lines.
       vector<string> *words = new vector<string>;

       for ( vector<string>::size_type line_pos = 0;
             line_pos < text_file->size(); ++line_pos )
       {
              const string &textline = (*text_file)[line_pos];

              // Start of the current word, or npos when no letters remain.
              string::size_type word_begin = textline.find_first_of(letters);

              while ( word_begin != string::npos )
              {
                     // One past the last letter of the current word, or
                     // npos when the word runs to the end of the line.
                     const string::size_type word_end =
                            textline.find_first_not_of(letters, word_begin);

                     // substr() clamps the count to the end of the string,
                     // so an npos word_end takes the rest of the line.
                     words->push_back(
                            textline.substr( word_begin, word_end - word_begin ));

                     if ( word_end == string::npos )
                     {
                            break;
                     }
                     word_begin = textline.find_first_of(letters, word_end);
              }
       } // end of for

       return words;
}

 

// Counts how often each word of *words occurs and returns the result as a
// heap-allocated map (word -> number of occurrences) that the caller must
// delete.
//
// Words shorter than 3 characters and words listed in the exclusion file
// "pkg95.txt" (one word per line, read from the current directory) are not
// counted. If the exclusion file cannot be opened, an error is written to
// stderr and the program terminates with exit status 1.
map< string, int > *appear_total( const vector<string> *words )
{
       // Build the set of excluded words.
       set<string> exclusion_set;

       ifstream exclusion_file( "pkg95.txt", ios::in );
       if (!exclusion_file) {
              cerr << "error: unable to open exclusion file: pkg95.txt" << endl;
              exit (1);
       }
       string textline;
       while ( getline(exclusion_file, textline) )
       {
              // Drop trailing blanks and a stray '\r' (CRLF files read on a
              // system that does not translate line endings); otherwise the
              // entry could never equal a word.
              const string::size_type last = textline.find_last_not_of(" \t\r");
              textline.erase( last == string::npos ? 0 : last + 1 );

              if ( !textline.empty() )
              {
                     exclusion_set.insert(textline);
              }
       }

       map<string, int> *word_map = new map<string, int>;

       // Record the occurrences in word_map.
       vector<string>::const_iterator iter = words->begin();
       for ( ; iter != words->end(); ++iter )
       {
              // Skip words of fewer than 3 characters and excluded words.
              if ( iter->size() < 3 || exclusion_set.count( *iter ) )
              {
                     continue;
              }

              // operator[] creates a missing entry with a count of 0, so a
              // single increment covers both the first and later sightings.
              ++(*word_map)[ *iter ];
       } // end of for

       return word_map;
}

 

// Re-indexes *text_map (word -> count) by count and writes the result to
// "3_1_2out.txt" in the current directory, most frequent words first.
//
// Returns a heap-allocated multimap (count -> word, ordered by descending
// count) that the caller must delete. Since C++11, entries with equal
// counts keep their insertion order, which here is the alphabetical order
// of text_map. *text_map itself is not modified.
//
// Each output line holds the word padded with spaces to 15 columns (longer
// words are written unpadded), followed by the count. If the output file
// cannot be opened, an error is written to stderr, nothing is written, and
// the multimap is still returned.
multimap< int, string, greater<int> > * multimap_total( map<string, int> *text_map )
{
       typedef multimap< int, string, greater<int> > freq_map;

       // Width of the word column in the report.
       const string::size_type column_width = 15;

       freq_map *word_map = new freq_map;

       for ( map< string, int >::const_iterator entry = text_map->begin();
             entry != text_map->end(); ++entry )
       {
              word_map->insert( make_pair( entry->second, entry->first ) );
       }

       const string ofile("3_1_2out.txt");

       ofstream outfile( ofile.c_str() );
       if (!outfile)
       {
              cerr << "error: unable to open output file: "
                     << ofile << endl;
              // Do not report success for a file that was never written.
              return word_map;
       }

       for ( freq_map::const_iterator entry = word_map->begin();
             entry != word_map->end(); ++entry )
       {
              const string &word = entry->second;

              outfile << word;

              // Pad only when the word is shorter than the column; the
              // subtraction is unsigned and would wrap around otherwise.
              if ( word.size() < column_width )
              {
                     outfile << string( column_width - word.size(), ' ' );
              }

              outfile << "出现 " << entry->first << "\t次" << '\n';
       } // end of for

       cout << "程序已将处理结果写入3_1_2out.txt,该文件保存在当前目录"
              << endl;

       return word_map;
}

 

// Writes *text_map (word -> count) to "3_1_1out.txt" in the current
// directory, one word per line in the map's (lexicographic) key order.
//
// Each line holds the word padded with spaces to 15 columns (longer words
// are written unpadded), followed by the count. If the output file cannot
// be opened, an error is written to stderr and nothing else happens.
// *text_map is not modified.
void map_output( map<string, int> *text_map )
{
       // Width of the word column in the report.
       const string::size_type column_width = 15;

       const string ofile("3_1_1out.txt");

       ofstream outfile( ofile.c_str() );
       if (!outfile)
       {
              cerr << "error: unable to open output file: "
                     << ofile << endl;
              // Do not report success for a file that was never written.
              return;
       }

       for ( map< string, int >::const_iterator entry = text_map->begin();
             entry != text_map->end(); ++entry )
       {
              const string &word = entry->first;

              outfile << word;

              // Pad only when the word is shorter than the column; the
              // subtraction is unsigned and would wrap around otherwise.
              if ( word.size() < column_width )
              {
                     outfile << string( column_width - word.size(), ' ' );
              }

              outfile << "出现 " << entry->second << "\t次" << '\n';
       } // end of for

       cout << "程序已将处理结果写入3_1_1out.txt,该文件保存在当前目录"
              << endl;
}

 

int main()

{

      

       vector<string> *text_file = retrieve_text("article.txt");

       strip_caps(text_file);

       vector<string> *words = separate_words(text_file);

       map< string, int > *text_map = appear_total(words);

       map_output( text_map );

       multimap_total(text_map);

      

       return 0;

}

 

程序执行结果

3_1_1out.txt

article        出现 2 次

counted        出现 1       次

counting       出现 1       次

counts         出现 2       次

descending     出现 1       次

english        出现 1 次

exclusive      出现 1 次

file           出现 1 次

files          出现 1 次

frequency      出现 2 次

function       出现 1 次

implement      出现 1       次

lexicographic 出现 1 次

output         出现 1       次

reads          出现 1       次

specified      出现 1 次

txt            出现 4       次

user           出现 1       次

word           出现 1      次

words          出现 2      次

 

3_1_2out.txt

txt            出现 4       次

article        出现 2 次

counts         出现 2       次

frequency      出现 2 次

words          出现 2      次

counted        出现 1       次

counting       出现 1       次

descending     出现 1       次

english        出现 1 次

exclusive      出现 1 次

file           出现 1 次

files          出现 1 次

function       出现 1 次

implement      出现 1       次

lexicographic 出现 1 次

output         出现 1       次

reads          出现 1       次

specified      出现 1 次

user           出现 1       次

word           出现 1      次

 

附PKG95.TXT 文件内容:

different

necessary

need

needed

needing

newest

next

no

nobody

non

none

not

nothing

now

nowhere

of

off

often

new

old

older

oldest

on

once

one

only

open

again

among

already

about

above

against

alone

after

also

although

along

always

an

across

and

another

ask

asking

asks

backed

away

should

show

came

all

almost

before

began

back

backing

be

became

because

becomes

been

at

behind

being

best

better

between

big

showed

ended

ending

both

but

by

asked

backs

can

cannot

number

numbers

case

few

find

finds

cases

clearly

her

herself

come

could

did

here

beings

fact

far

felt

become

first

for

four

from

full

fully

furthers

gave

general

generally

get

gets

gives

facts

go

going

good

goods

certain

certainly

clear

great

greater

greatest

group

grouped

grouping

groups

got

has

have

having

he

further

furthered

had

furthering

itself

faces

highest

him

himself

his

how

however

if

important

interests

into

is

it

its

anyone

anything

anywhere

are

area

areas

around

as

seconds

see

seem

seemed

seeming

seems

sees

right

several

shall

she

enough

even

evenly

over

part

parted

parting

parts

per

down

place

places

point

pointed

pointing

points

possible

present

presented

presenting

ends

high

mrs

much

must

my

myself

presents

down

problem

problems

put

puts

quite

will

with

within

rather

really

room

rooms

said

same

right

showing

shows

side

sides

since

small

smaller

smallest

so

some

somebody

someone

something

somewhere

state

states

such

sure

take

taken

than

that

the

their

then

there

therefore

these

thought

thoughts

three

through

thus

to

today

together

too

took

toward

turn

turned

turning

turns

two

still

under

until

up

others

upon

us

use

used

uses

very

want

wanted

wanting

wants

was

way

we

well

wells

went

were

what

when

where

whether

which

while

who

whole

year

years

yet

you

everyone

everything

everywhere

young

younger

youngest

your

yours

ever

works

every

everybody

face

other

our

out

just

interesting

high

might

keep

keeps

give

given

higher

kind

knew

know

known

knows

large

largely

last

later

latest

least

less

needs

never

newer

let

lets

like

likely

long

high

longer

longest

made

make

making

man

many

may

me

member

members

men

more

in

interest

interested

most

mostly

mr

opened

opening

new

opens

or

perhaps

order

ordered

ordering

orders

differ

differently

do

does

done

downed

downing

downs

they

thing

things

think

thinks

this

those

ways

why

without

work

worked

working

would

during

each

early

either

end

though

still

whose

saw

say

says

them

second

any

anybody

抱歉!评论已关闭.